public PageCrawlStartingArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl)
    : base(crawlContext)
{
    if (pageToCrawl == null)
        throw new ArgumentNullException("pageToCrawl");

    PageToCrawl = pageToCrawl;
}
public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
    if (pageToCrawl == null)
        return new CrawlDecision { Allow = false, Reason = "Null page to crawl" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    if (pageToCrawl.RedirectedFrom != null && pageToCrawl.RedirectPosition > crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects)
        return new CrawlDecision { Allow = false, Reason = string.Format("HttpRequestMaxAutoRedirects limit of [{0}] has been reached", crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects) };

    if (pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
        return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

    if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
        return new CrawlDecision { Allow = false, Reason = "Scheme does not begin with http" };

    //TODO Do we want to ignore redirect chains (i.e. not treat them as separate page crawls)?
    if (!pageToCrawl.IsRetry &&
        crawlContext.CrawlConfiguration.MaxPagesToCrawl > 0 &&
        crawlContext.CrawledCount + crawlContext.Scheduler.Count + 1 > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
    {
        return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl) };
    }

    int pagesCrawledInThisDomain = 0;
    if (!pageToCrawl.IsRetry &&
        crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain > 0 &&
        crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out pagesCrawledInThisDomain) &&
        pagesCrawledInThisDomain > 0)
    {
        if (pagesCrawledInThisDomain >= crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain)
            return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawlPerDomain limit of [{0}] has been reached for domain [{1}]", crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain, pageToCrawl.Uri.Authority) };
    }

    if (!crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled && !pageToCrawl.IsInternal)
        return new CrawlDecision { Allow = false, Reason = "Link is external" };

    return new CrawlDecision { Allow = true };
}
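// Another snippet in this section overrides this virtual method in a subclass, so the
// decision logic above can be specialized by deriving from it. A minimal sketch,
// assuming the method above lives in a class named CrawlDecisionMaker; the
// MaxQueryStringLength rule is hypothetical and added only for illustration.
public class QueryStringLimitDecisionMaker : CrawlDecisionMaker
{
    public int MaxQueryStringLength { get; set; } = 200;

    public override CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
    {
        // Apply one extra rule, then fall back to all of the base checks above.
        if (pageToCrawl != null && pageToCrawl.Uri.Query.Length > MaxQueryStringLength)
            return new CrawlDecision { Allow = false, Reason = "Query string is above max length" };

        return base.ShouldCrawlPage(pageToCrawl, crawlContext);
    }
}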
public void SetUp()
{
    _unitUnderTest = GetInstance();
    _page1 = new PageToCrawl(new Uri("http://a.com"));
    _page2 = new PageToCrawl(new Uri("http://b.com"));
}
public void Add(PageToCrawl page) { _urlQueue.Enqueue(page); }
void PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e) { PageToCrawl pageToCrawl = e.PageToCrawl; Log.Logger.Error($"Did not crawl page {pageToCrawl.Uri.AbsoluteUri} due to {e.DisallowedReason}"); }
static void Crawler_PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e) { PageToCrawl pageToCrawl = e.PageToCrawl; _log.Info($"Did not crawl page {pageToCrawl.Uri.AbsoluteUri} due to {e.DisallowedReason}"); }
/// <summary>
/// Begins a synchronous crawl using the uri param. Subscribe to events to process data as it becomes available.
/// </summary>
public virtual CrawlResult Crawl(Uri uri, CancellationTokenSource cancellationTokenSource)
{
    if (uri == null)
        throw new ArgumentNullException("uri");

    _crawlContext.RootUri = uri;

    if (cancellationTokenSource != null)
        _crawlContext.CancellationTokenSource = cancellationTokenSource;

    _crawlResult = new CrawlResult();
    _crawlResult.RootUri = _crawlContext.RootUri;
    _crawlResult.CrawlContext = _crawlContext;
    _crawlComplete = false;

    _logger.InfoFormat("About to crawl site [{0}]", uri.AbsoluteUri);
    PrintConfigValues(_crawlContext.CrawlConfiguration);

    if (_memoryManager != null)
    {
        _crawlContext.MemoryUsageBeforeCrawlInMb = _memoryManager.GetCurrentUsageInMb();
        _logger.InfoFormat("Starting memory usage for site [{0}] is [{1}mb]", uri.AbsoluteUri, _crawlContext.MemoryUsageBeforeCrawlInMb);
    }

    _crawlContext.CrawlStartDate = DateTime.Now;
    Stopwatch timer = Stopwatch.StartNew();

    if (_crawlContext.CrawlConfiguration.CrawlTimeoutSeconds > 0)
    {
        _timeoutTimer = new Timer(_crawlContext.CrawlConfiguration.CrawlTimeoutSeconds * 1000);
        _timeoutTimer.Elapsed += HandleCrawlTimeout;
        _timeoutTimer.Start();
    }

    try
    {
        PageToCrawl rootPage = new PageToCrawl(uri) { ParentUri = uri, IsInternal = true, IsRoot = true };
        if (ShouldSchedulePageLink(rootPage))
            _scheduler.Add(rootPage);

        VerifyRequiredAvailableMemory();
        CrawlSite();
    }
    catch (Exception e)
    {
        _crawlResult.ErrorException = e;
        _logger.FatalFormat("An error occurred while crawling site [{0}]", uri);
        _logger.Fatal(e);
    }
    finally
    {
        if (_threadManager != null)
            _threadManager.Dispose();
    }

    if (_timeoutTimer != null)
        _timeoutTimer.Stop();

    timer.Stop();

    if (_memoryManager != null)
    {
        _crawlContext.MemoryUsageAfterCrawlInMb = _memoryManager.GetCurrentUsageInMb();
        _logger.InfoFormat("Ending memory usage for site [{0}] is [{1}mb]", uri.AbsoluteUri, _crawlContext.MemoryUsageAfterCrawlInMb);
    }

    _crawlResult.Elapsed = timer.Elapsed;
    _logger.InfoFormat("Crawl complete for site [{0}]: Crawled [{1}] pages in [{2}]", _crawlResult.RootUri.AbsoluteUri, _crawlResult.CrawlContext.CrawledCount, _crawlResult.Elapsed);

    return _crawlResult;
}
// Page crawl was disallowed
public static void Disallowed(object sender, PageCrawlDisallowedArgs e)
{
    PageToCrawl pageToCrawl = e.PageToCrawl;
    Outputer.Output(string.Format("Did not crawl page {0} due to error: {1}", pageToCrawl.Uri.AbsoluteUri, e.DisallowedReason));
}
protected virtual void WaitMinimumRetryDelay(PageToCrawl pageToCrawl)
{
    //TODO No unit tests cover these lines
    if (pageToCrawl.LastRequest == null)
    {
        _logger.WarnFormat("pageToCrawl.LastRequest value is null for Url:{0}. Cannot retry without this value.", pageToCrawl.Uri.AbsoluteUri);
        return;
    }

    double milliSinceLastRequest = (DateTime.Now - pageToCrawl.LastRequest.Value).TotalMilliseconds;
    double milliToWait;
    if (pageToCrawl.RetryAfter.HasValue)
    {
        // Use the time to wait provided by the server instead of the config, if any.
        milliToWait = pageToCrawl.RetryAfter.Value * 1000 - milliSinceLastRequest;
    }
    else
    {
        if (!(milliSinceLastRequest < _crawlContext.CrawlConfiguration.MinRetryDelayInMilliseconds))
            return;
        milliToWait = _crawlContext.CrawlConfiguration.MinRetryDelayInMilliseconds - milliSinceLastRequest;
    }

    _logger.InfoFormat("Waiting [{0}] milliseconds before retrying Url:[{1}] LastRequest:[{2}] SoonestNextRequest:[{3}]",
        milliToWait,
        pageToCrawl.Uri.AbsoluteUri,
        pageToCrawl.LastRequest,
        pageToCrawl.LastRequest.Value.AddMilliseconds(_crawlContext.CrawlConfiguration.MinRetryDelayInMilliseconds));

    //TODO Cannot use RateLimiter since it currently cannot handle dynamic sleep times, so using Thread.Sleep in the meantime
    if (milliToWait > 0)
        Thread.Sleep(TimeSpan.FromMilliseconds(milliToWait));
}
//protected virtual async Task ProcessPage(PageToCrawl pageToCrawl)
protected virtual void ProcessPage(PageToCrawl pageToCrawl)
{
    try
    {
        if (pageToCrawl == null)
            return;

        ThrowIfCancellationRequested();

        AddPageToContext(pageToCrawl);

        //CrawledPage crawledPage = await CrawlThePage(pageToCrawl);
        CrawledPage crawledPage = CrawlThePage(pageToCrawl);

        // Validate the root uri in case of a redirection.
        if (crawledPage.IsRoot)
            ValidateRootUriForRedirection(crawledPage);

        if (!IsRedirect(crawledPage) ||
            (IsRedirect(crawledPage) &&
             _crawlContext.CrawlConfiguration.IsHttpRequestAutoRedirectsEnabled &&
             !_scheduler.IsUriKnown(crawledPage.HttpWebResponse.ResponseUri)))
        {
            if (PageSizeIsAboveMax(crawledPage))
                return;

            ThrowIfCancellationRequested();

            bool shouldCrawlPageLinks = ShouldCrawlPageLinks(crawledPage);
            if (shouldCrawlPageLinks || _crawlContext.CrawlConfiguration.IsForcedLinkParsingEnabled)
                ParsePageLinks(crawledPage);

            ThrowIfCancellationRequested();

            if (shouldCrawlPageLinks)
                SchedulePageLinks(crawledPage);

            ThrowIfCancellationRequested();

            FirePageCrawlCompletedEventAsync(crawledPage);
            FirePageCrawlCompletedEvent(crawledPage);

            if (ShouldRecrawlPage(crawledPage))
            {
                crawledPage.IsRetry = true;
                _scheduler.Add(crawledPage);
            }

            if (IsRedirect(crawledPage) && _crawlContext.CrawlConfiguration.IsHttpRequestAutoRedirectsEnabled)
                _scheduler.AddKnownUri(crawledPage.HttpWebResponse.ResponseUri);
        }
        else if (IsRedirect(crawledPage) && !_crawlContext.CrawlConfiguration.IsHttpRequestAutoRedirectsEnabled)
        {
            ProcessRedirect(crawledPage);
        }
    }
    catch (OperationCanceledException)
    {
        _logger.DebugFormat("Thread cancelled while crawling/processing page [{0}]", pageToCrawl.Uri);
        throw;
    }
    catch (Exception e)
    {
        _crawlResult.ErrorException = e;
        _logger.FatalFormat("Error occurred during processing of page [{0}]", pageToCrawl.Uri);
        _logger.Fatal(e);
        _crawlContext.IsCrawlHardStopRequested = true;
    }
}
protected virtual void ProcessRedirect(CrawledPage crawledPage)
{
    if (crawledPage.RedirectPosition >= 20)
        _logger.WarnFormat("Page [{0}] is part of a chain of 20 or more consecutive redirects; redirects for this chain will now be aborted.", crawledPage.Uri);

    try
    {
        var uri = ExtractRedirectUri(crawledPage);

        PageToCrawl page = new PageToCrawl(uri);
        page.ParentUri = crawledPage.ParentUri;
        page.CrawlDepth = crawledPage.CrawlDepth;
        page.IsInternal = IsInternalUri(uri);
        page.IsRoot = false;
        page.RedirectedFrom = crawledPage;
        page.RedirectPosition = crawledPage.RedirectPosition + 1;

        crawledPage.RedirectedTo = page;
        _logger.DebugFormat("Page [{0}] is requesting that it be redirected to [{1}]", crawledPage.Uri, crawledPage.RedirectedTo.Uri);

        if (ShouldSchedulePageLink(page))
        {
            if (_scheduler.IsUriKnown(uri))
            {
                _logger.InfoFormat("Page [{0}] is redirected to [{1}], which is a page already crawled.", crawledPage.Uri, crawledPage.RedirectedTo.Uri);
            }
            else
            {
                _logger.InfoFormat("Page [{0}] will be redirected to [{1}]", crawledPage.Uri, crawledPage.RedirectedTo.Uri);
                _scheduler.Add(page);
            }
        }
    }
    catch { }
}
protected virtual void FirePageCrawlStartingEvent(PageToCrawl pageToCrawl)
{
    try
    {
        EventHandler<PageCrawlStartingArgs> threadSafeEvent = PageCrawlStarting;
        if (threadSafeEvent != null)
            threadSafeEvent(this, new PageCrawlStartingArgs(_crawlContext, pageToCrawl));
    }
    catch (Exception e)
    {
        _logger.Error("An unhandled exception was thrown by a subscriber of the PageCrawlStarting event for url:" + pageToCrawl.Uri.AbsoluteUri);
        _logger.Error(e);
    }
}
protected virtual void FirePageCrawlStartingEventAsync(PageToCrawl pageToCrawl)
{
    EventHandler<PageCrawlStartingArgs> threadSafeEvent = PageCrawlStartingAsync;
    if (threadSafeEvent != null)
    {
        // Fire each subscriber's delegate asynchronously
        foreach (EventHandler<PageCrawlStartingArgs> del in threadSafeEvent.GetInvocationList())
        {
            del.BeginInvoke(this, new PageCrawlStartingArgs(_crawlContext, pageToCrawl), null, null);
        }
    }
}
//protected virtual async Task<CrawledPage> CrawlThePage(PageToCrawl pageToCrawl)
protected virtual CrawledPage CrawlThePage(PageToCrawl pageToCrawl)
{
    _logger.DebugFormat("About to crawl page [{0}]", pageToCrawl.Uri.AbsoluteUri);

    FirePageCrawlStartingEventAsync(pageToCrawl);
    FirePageCrawlStartingEvent(pageToCrawl);

    if (pageToCrawl.IsRetry)
        WaitMinimumRetryDelay(pageToCrawl);

    pageToCrawl.LastRequest = DateTime.Now;

    CrawledPage crawledPage = _pageRequester.MakeRequest(pageToCrawl.Uri, ShouldDownloadPageContent);
    //CrawledPage crawledPage = await _pageRequester.MakeRequestAsync(pageToCrawl.Uri, ShouldDownloadPageContent);

    dynamic combinedPageBag = this.CombinePageBags(pageToCrawl.PageBag, crawledPage.PageBag);
    Mapper.CreateMap<PageToCrawl, CrawledPage>();
    Mapper.Map(pageToCrawl, crawledPage);
    crawledPage.PageBag = combinedPageBag;

    if (crawledPage.HttpWebResponse == null)
        _logger.InfoFormat("Page crawl complete, Status:[NA] Url:[{0}] Elapsed:[{1}] Parent:[{2}] Retry:[{3}]", crawledPage.Uri.AbsoluteUri, crawledPage.Elapsed, crawledPage.ParentUri, crawledPage.RetryCount);
    else
        _logger.InfoFormat("Page crawl complete, Status:[{0}] Url:[{1}] Elapsed:[{2}] Parent:[{3}] Retry:[{4}]", Convert.ToInt32(crawledPage.HttpWebResponse.StatusCode), crawledPage.Uri.AbsoluteUri, crawledPage.Elapsed, crawledPage.ParentUri, crawledPage.RetryCount);

    return crawledPage;
}
protected virtual void AddPageToContext(PageToCrawl pageToCrawl)
{
    if (pageToCrawl.IsRetry)
    {
        pageToCrawl.RetryCount++;
        return;
    }

    int domainCount = 0;
    Interlocked.Increment(ref _crawlContext.CrawledCount);
    lock (_crawlContext.CrawlCountByDomain)
    {
        if (_crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out domainCount))
            _crawlContext.CrawlCountByDomain[pageToCrawl.Uri.Authority] = domainCount + 1;
        else
            _crawlContext.CrawlCountByDomain.TryAdd(pageToCrawl.Uri.Authority, 1);
    }
}
/// <summary>
/// Begins a synchronous crawl using the uri param. Subscribe to events to process data as it becomes available.
/// </summary>
public virtual CrawlResult Crawl(Uri uri, CancellationTokenSource cancellationTokenSource)
{
    if (uri == null)
        throw new ArgumentNullException("uri");

    _crawlContext.RootUri = _crawlContext.OriginalRootUri = uri;

    if (cancellationTokenSource != null)
        _crawlContext.CancellationTokenSource = cancellationTokenSource;

    _crawlResult = new CrawlResult();
    _crawlResult.RootUri = _crawlContext.RootUri;
    _crawlResult.CrawlContext = _crawlContext;
    _crawlComplete = false;

    _logger.InfoFormat("About to crawl site [{0}]", uri.AbsoluteUri);
    PrintConfigValues(_crawlContext.CrawlConfiguration);

    if (_memoryManager != null)
    {
        _crawlContext.MemoryUsageBeforeCrawlInMb = _memoryManager.GetCurrentUsageInMb();
        _logger.InfoFormat("Starting memory usage for site [{0}] is [{1}mb]", uri.AbsoluteUri, _crawlContext.MemoryUsageBeforeCrawlInMb);
    }

    _crawlContext.CrawlStartDate = DateTime.Now;
    Stopwatch timer = Stopwatch.StartNew();

    if (_crawlContext.CrawlConfiguration.CrawlTimeoutSeconds > 0)
    {
        _timeoutTimer = new Timer(_crawlContext.CrawlConfiguration.CrawlTimeoutSeconds * 1000);
        _timeoutTimer.Elapsed += HandleCrawlTimeout;
        _timeoutTimer.Start();
    }

    try
    {
        PageToCrawl rootPage = new PageToCrawl(uri) { ParentUri = uri, IsInternal = true, IsRoot = true };
        if (ShouldSchedulePageLink(rootPage))
            _scheduler.Add(rootPage);

        VerifyRequiredAvailableMemory();
        CrawlSite();
    }
    catch (Exception e)
    {
        _crawlResult.ErrorException = e;
        _logger.FatalFormat("An error occurred while crawling site [{0}]", uri);
        _logger.Fatal(e);
    }
    finally
    {
        if (_threadManager != null)
            _threadManager.Dispose();
    }

    if (_timeoutTimer != null)
        _timeoutTimer.Stop();

    timer.Stop();

    if (_memoryManager != null)
    {
        _crawlContext.MemoryUsageAfterCrawlInMb = _memoryManager.GetCurrentUsageInMb();
        _logger.InfoFormat("Ending memory usage for site [{0}] is [{1}mb]", uri.AbsoluteUri, _crawlContext.MemoryUsageAfterCrawlInMb);
    }

    _crawlResult.Elapsed = timer.Elapsed;
    _logger.InfoFormat("Crawl complete for site [{0}]: Crawled [{1}] pages in [{2}]", _crawlResult.RootUri.AbsoluteUri, _crawlResult.CrawlContext.CrawledCount, _crawlResult.Elapsed);

    return _crawlResult;
}
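// Usage sketch for the Crawl method above: wire up the events that the handlers elsewhere
// in this section consume, then start a synchronous crawl. The PoliteWebCrawler type name
// and the sample configuration are assumptions for illustration; the event names,
// the Crawl(Uri, CancellationTokenSource) signature, and the CrawlResult members come
// from the snippets in this section.
static void Main(string[] args)
{
    var crawler = new PoliteWebCrawler();  // assumed concrete crawler type

    crawler.PageCrawlStarting += (sender, e) =>
        Console.WriteLine("About to crawl {0}", e.PageToCrawl.Uri.AbsoluteUri);
    crawler.PageCrawlDisallowed += (sender, e) =>
        Console.WriteLine("Did not crawl {0} due to {1}", e.PageToCrawl.Uri.AbsoluteUri, e.DisallowedReason);

    // Passing a CancellationTokenSource lets the caller stop the crawl externally.
    var cancellationTokenSource = new CancellationTokenSource();
    CrawlResult result = crawler.Crawl(new Uri("http://a.com"), cancellationTokenSource);

    if (result.ErrorException != null)
        Console.WriteLine("Crawl of {0} failed: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    else
        Console.WriteLine("Crawl of {0} completed in {1}", result.RootUri.AbsoluteUri, result.Elapsed);
}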
static void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e) { PageToCrawl pageToCrawl = e.PageToCrawl; Console.WriteLine("About to crawl link {0} which was found on page {1}", pageToCrawl.Uri.AbsoluteUri, pageToCrawl.ParentUri.AbsoluteUri); }
public override CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
    return base.ShouldCrawlPage(pageToCrawl, crawlContext);
}
private void _crawler_PageCrawlStartingAsync(object sender, PageCrawlStartingArgs e) { PageToCrawl pageToCrawl = e.PageToCrawl; }
public IEnumerable<PageToCrawl> GetLinks(CrawledPage crawledPage)
{
    List<PageToCrawl> pages = new List<PageToCrawl>();

    List<string> rules = GetDynamicData<List<string>>(crawledPage.PageBag, "Rules");
    if (rules == null)
        rules = _thinkCrawlConfiguration.StartRule.GetRules();

    var doc = crawledPage.HtmlDocument;
    foreach (var ruleName in rules)
    {
        var rule = _thinkCrawlConfiguration.GetRule(ruleName);
        if (rule == null)
            continue;

        string xpath = rule.XPath;
        if (string.IsNullOrEmpty(xpath) || !rule.NeedCrawl)
            continue;

        #region add link
        bool donotFilter = string.IsNullOrEmpty(rule.FilterText);

        var links = doc.DocumentNode.SelectNodes(xpath);
        if (links == null || links.Count < 1)
            continue;

        foreach (var a in links)
        {
            string text = StringUtil.RemoveHTML(a.InnerText);
            string url = FormatUrl(a.Attributes["href"].Value ?? "", crawledPage.Uri);
            if (string.IsNullOrEmpty(url))
                continue;

            if (donotFilter || text == rule.FilterText)
            {
                _logger.LogInformation("Text: [{0}], URL: {1}", text, url);
                var uri = new Uri(url);
                PageToCrawl page = new PageToCrawl(uri)
                {
                    ParentUri = crawledPage.Uri,
                    CrawlDepth = crawledPage.CrawlDepth + 1,
                    IsInternal = uri.Authority == crawledPage.Uri.Authority,
                    IsRoot = false
                };
                page.PageBag.Rules = rule.GetNextRules();
                pages.Add(page);
            }
        }
        #endregion
    }

    return pages;
}
protected virtual void SchedulePageLinks(CrawledPage crawledPage)
{
    foreach (Uri uri in crawledPage.ParsedLinks)
    {
        // First validate that the link was not already visited or added to the list of pages to visit,
        // so we don't make the same validation and fire the same events twice.
        if (!_scheduler.IsUriKnown(uri) &&
            (_shouldScheduleLinkDecisionMaker == null || _shouldScheduleLinkDecisionMaker.Invoke(uri, crawledPage, _crawlContext)))
        {
            try //Added due to a bug in the Uri class related to this (http://stackoverflow.com/questions/2814951/system-uriformatexception-invalid-uri-the-hostname-could-not-be-parsed)
            {
                PageToCrawl page = new PageToCrawl(uri);
                page.ParentUri = crawledPage.Uri;
                page.CrawlDepth = crawledPage.CrawlDepth + 1;
                page.IsInternal = IsInternalUri(uri);
                page.IsRoot = false;

                if (ShouldSchedulePageLink(page))
                    _scheduler.Add(page);
            }
            catch { }
        }

        // Add this link to the list of known Urls so validations are not duplicated in the future.
        _scheduler.AddKnownUri(uri);
    }
}
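// The _shouldScheduleLinkDecisionMaker consulted above is invoked as
// (Uri, CrawledPage, CrawlContext) => bool. A minimal sketch of such a delegate,
// assuming the caller can supply one; the same-host and no-query-string rules
// are illustrative only.
Func<Uri, CrawledPage, CrawlContext, bool> shouldScheduleLink = (uri, crawledPage, crawlContext) =>
{
    // Only schedule links on the same host as the page they were found on,
    // and skip links that carry a query string.
    return uri.Authority == crawledPage.Uri.Authority
        && string.IsNullOrEmpty(uri.Query);
};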
protected virtual bool ShouldSchedulePageLink(PageToCrawl page)
{
    if ((page.IsInternal || _crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled) && ShouldCrawlPage(page))
        return true;

    return false;
}
public void Add_NullPage() { PageToCrawl nullPage = null; _unitUnderTest.Add(nullPage); }
protected virtual bool ShouldCrawlPage(PageToCrawl pageToCrawl)
{
    CrawlDecision shouldCrawlPageDecision = _crawlDecisionMaker.ShouldCrawlPage(pageToCrawl, _crawlContext);
    if (shouldCrawlPageDecision.Allow)
        shouldCrawlPageDecision = (_shouldCrawlPageDecisionMaker != null)
            ? _shouldCrawlPageDecisionMaker.Invoke(pageToCrawl, _crawlContext)
            : new CrawlDecision { Allow = true };

    if (!shouldCrawlPageDecision.Allow)
    {
        _logger.DebugFormat("Page [{0}] not crawled, [{1}]", pageToCrawl.Uri.AbsoluteUri, shouldCrawlPageDecision.Reason);
        FirePageCrawlDisallowedEventAsync(pageToCrawl, shouldCrawlPageDecision.Reason);
        FirePageCrawlDisallowedEvent(pageToCrawl, shouldCrawlPageDecision.Reason);
    }

    SignalCrawlStopIfNeeded(shouldCrawlPageDecision);
    return shouldCrawlPageDecision.Allow;
}
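// The _shouldCrawlPageDecisionMaker consulted above is invoked as
// (PageToCrawl, CrawlContext) => CrawlDecision and runs on top of the built-in checks.
// A minimal sketch of such a delegate, assuming the caller can supply one;
// the "/admin" exclusion rule is illustrative only.
Func<PageToCrawl, CrawlContext, CrawlDecision> shouldCrawlPage = (pageToCrawl, crawlContext) =>
{
    // Veto any page under an admin path; everything else stays allowed.
    if (pageToCrawl.Uri.AbsolutePath.StartsWith("/admin", StringComparison.OrdinalIgnoreCase))
        return new CrawlDecision { Allow = false, Reason = "Admin pages are excluded" };

    return new CrawlDecision { Allow = true };
};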
static void Crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e) { PageToCrawl pageToCrawl = e.PageToCrawl; _log.Info($"About to crawl link {pageToCrawl.Uri.AbsoluteUri} which was found on page {pageToCrawl.ParentUri.AbsoluteUri}"); }
private void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e) { PageToCrawl pageToCrawl = e.PageToCrawl; var result = string.Format("About to crawl link {0} which was found on page {1}", pageToCrawl.Uri.AbsoluteUri, pageToCrawl.ParentUri.AbsoluteUri); }
protected virtual async Task ProcessPage(PageToCrawl pageToCrawl)
{
    lock (_processingPageCountLock)
    {
        _processingPageCount++;
        Log.Debug($"Incrementing processingPageCount to [{_processingPageCount}]");
    }

    try
    {
        if (pageToCrawl == null)
            return;

        ThrowIfCancellationRequested();

        AddPageToContext(pageToCrawl);

        var crawledPage = await CrawlThePage(pageToCrawl).ConfigureAwait(false);

        // Validate the root uri in case of a redirection.
        if (crawledPage.IsRoot)
            ValidateRootUriForRedirection(crawledPage);

        if (IsRedirect(crawledPage) && !_crawlContext.CrawlConfiguration.IsHttpRequestAutoRedirectsEnabled)
            ProcessRedirect(crawledPage);

        if (PageSizeIsAboveMax(crawledPage))
            return;

        ThrowIfCancellationRequested();

        var shouldCrawlPageLinks = ShouldCrawlPageLinks(crawledPage);
        if (shouldCrawlPageLinks || _crawlContext.CrawlConfiguration.IsForcedLinkParsingEnabled)
            ParsePageLinks(crawledPage);

        ThrowIfCancellationRequested();

        if (shouldCrawlPageLinks)
            SchedulePageLinks(crawledPage);

        ThrowIfCancellationRequested();

        FirePageCrawlCompletedEvent(crawledPage);

        if (ShouldRecrawlPage(crawledPage))
        {
            crawledPage.IsRetry = true;
            _scheduler.Add(crawledPage);
        }
    }
    catch (OperationCanceledException)
    {
        Log.Debug("Thread cancelled while crawling/processing page [{0}]", pageToCrawl.Uri);
        throw;
    }
    catch (Exception e)
    {
        _crawlResult.ErrorException = e;
        Log.Fatal("Error occurred during processing of page [{0}]", pageToCrawl.Uri);
        Log.Fatal(e, "Exception details -->");
        _crawlContext.IsCrawlHardStopRequested = true;
    }
    finally
    {
        lock (_processingPageCountLock)
        {
            _processingPageCount--;
            Log.Debug($"Decrementing processingPageCount to [{_processingPageCount}]");
        }
    }
}
private void crawler_CrawlerStart(object sender, PageCrawlStartingArgs e) { PageToCrawl page = e.PageToCrawl; Console.WriteLine("Starting with {0}", page.Uri.ToString()); }
//protected virtual async Task ProcessPage(PageToCrawl pageToCrawl)
protected virtual void ProcessPage(PageToCrawl pageToCrawl)
{
    try
    {
        if (pageToCrawl == null)
            return;

        ThrowIfCancellationRequested();

        AddPageToContext(pageToCrawl);

        //CrawledPage crawledPage = await CrawlThePage(pageToCrawl);
        CrawledPage crawledPage = CrawlThePage(pageToCrawl);

        if (IsRedirect(crawledPage))
            ProcessRedirect(crawledPage);

        if (PageSizeIsAboveMax(crawledPage))
            return;

        ThrowIfCancellationRequested();

        bool shouldCrawlPageLinks = ShouldCrawlPageLinks(crawledPage);
        if (shouldCrawlPageLinks || _crawlContext.CrawlConfiguration.IsForcedLinkParsingEnabled)
            ParsePageLinks(crawledPage);

        ThrowIfCancellationRequested();

        if (shouldCrawlPageLinks)
            SchedulePageLinks(crawledPage);

        ThrowIfCancellationRequested();

        FirePageCrawlCompletedEventAsync(crawledPage);
        FirePageCrawlCompletedEvent(crawledPage);

        if (ShouldRecrawlPage(crawledPage))
        {
            crawledPage.IsRetry = true;
            _scheduler.Add(crawledPage);
        }
    }
    catch (OperationCanceledException)
    {
        _logger.DebugFormat("Thread cancelled while crawling/processing page [{0}]", pageToCrawl.Uri);
        throw;
    }
    catch (Exception e)
    {
        _crawlResult.ErrorException = e;
        _logger.FatalFormat("Error occurred during processing of page [{0}]", pageToCrawl.Uri);
        _logger.Fatal(e);
        _crawlContext.IsCrawlHardStopRequested = true;
    }
}
private void crawler_CrawlerDisalowed(object sender, PageCrawlDisallowedArgs e) { PageToCrawl page = e.PageToCrawl; Console.WriteLine("Disallowed: {0}", page.Uri.ToString()); }
public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
    if (pageToCrawl == null)
        return new CrawlDecision { Allow = false, Reason = "Null page to crawl" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    if (pageToCrawl.RedirectedFrom != null && pageToCrawl.RedirectPosition > crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects)
        return new CrawlDecision { Allow = false, Reason = string.Format("HttpRequestMaxAutoRedirects limit of [{0}] has been reached", crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects) };

    if (pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
        return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

    if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
        return new CrawlDecision { Allow = false, Reason = "Scheme does not begin with http" };

    //TODO Do we want to ignore redirect chains (i.e. not treat them as separate page crawls)?
    if (!pageToCrawl.IsRetry &&
        crawlContext.CrawlConfiguration.MaxPagesToCrawl > 0 &&
        crawlContext.CrawledCount + crawlContext.Scheduler.Count + 1 > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
    {
        return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl) };
    }

    int pagesCrawledInThisDomain = 0;
    if (!pageToCrawl.IsRetry &&
        crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain > 0 &&
        crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out pagesCrawledInThisDomain) &&
        pagesCrawledInThisDomain > 0)
    {
        if (pagesCrawledInThisDomain >= crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain)
            return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawlPerDomain limit of [{0}] has been reached for domain [{1}]", crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain, pageToCrawl.Uri.Authority) };
    }

    if (!crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled && !pageToCrawl.IsInternal)
        return new CrawlDecision { Allow = false, Reason = "Link is external" };

    return new CrawlDecision { Allow = true };
}
/// <summary>
/// Logs the reason a page was not crawled
/// </summary>
/// <param name="sender"></param>
/// <param name="e"></param>
void crawler_PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e)
{
    PageToCrawl pageToCrawl = e.PageToCrawl;
    log.Info("Did not crawl page " + pageToCrawl.Uri.AbsoluteUri + " due to " + e.DisallowedReason);
}
void ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e) { PageToCrawl pageToCrawl = e.PageToCrawl; Log.Logger.Debug($"About to crawl link {pageToCrawl.Uri.AbsoluteUri} which was found on page {pageToCrawl.ParentUri.AbsoluteUri}"); }
/// <summary>
/// Fired when the crawler is about to crawl a page
/// </summary>
/// <param name="sender"></param>
/// <param name="e"></param>
void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
{
    PageToCrawl pageToCrawl = e.PageToCrawl;
    log.Info("About to crawl link " + pageToCrawl.Uri.AbsoluteUri + " which was found on page " + pageToCrawl.ParentUri.AbsoluteUri);
}
void crawler_PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e) { PageToCrawl pageToCrawl = e.PageToCrawl; Console.WriteLine("Did not crawl page {0} due to {1}", pageToCrawl.Uri.AbsoluteUri, e.DisallowedReason); }
public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
    if (pageToCrawl == null)
        return new CrawlDecision { Allow = false, Reason = "Null page to crawl" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    if (pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
        return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

    if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
        return new CrawlDecision { Allow = false, Reason = "Scheme does not begin with http" };

    if (!pageToCrawl.IsRetry &&
        crawlContext.CrawlConfiguration.MaxPagesToCrawl > 0 &&
        crawlContext.CrawledCount + 1 > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
    {
        return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl) };
    }

    int pagesCrawledInThisDomain = 0;
    if (!pageToCrawl.IsRetry &&
        crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain > 0 &&
        crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out pagesCrawledInThisDomain) &&
        pagesCrawledInThisDomain > 0)
    {
        if (pagesCrawledInThisDomain >= crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain)
            return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawlPerDomain limit of [{0}] has been reached for domain [{1}]", crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain, pageToCrawl.Uri.Authority) };
    }

    if (!crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled && !pageToCrawl.IsInternal)
        return new CrawlDecision { Allow = false, Reason = "Link is external" };

    return new CrawlDecision { Allow = true };
}
// Page crawl is starting
public static void Starting(object sender, PageCrawlStartingArgs e)
{
    PageToCrawl pageToCrawl = e.PageToCrawl;
    Outputer.Output(string.Format("About to crawl link {0}, which was found on page {1}", pageToCrawl.Uri.AbsoluteUri, pageToCrawl.ParentUri.AbsoluteUri));
}
protected virtual List<PageToCrawl> GetCrawlPages(CrawledPage crawledPage, IEnumerable<string> hrefValues)
{
    List<PageToCrawl> pages = new List<PageToCrawl>();
    if (hrefValues == null || hrefValues.Count() < 1)
        return pages;

    //Use the uri of the page that actually responded to the request instead of crawledPage.Uri (Issue 82).
    //Using HttpWebRequest.Address instead of HttpWebResponse.ResponseUri since this is the best practice and mentioned on http://msdn.microsoft.com/en-us/library/system.net.httpwebresponse.responseuri.aspx
    Uri uriToUse = crawledPage.Uri;

    //If an html base tag exists, use it instead of the page uri for relative links
    string baseHref = GetBaseHrefValue(crawledPage);
    if (!string.IsNullOrEmpty(baseHref))
    {
        if (baseHref.StartsWith("//"))
            baseHref = crawledPage.Uri.Scheme + ":" + baseHref;

        try
        {
            uriToUse = new Uri(baseHref);
        }
        catch { }
    }

    string href = "";
    foreach (string hrefValue in hrefValues)
    {
        try
        {
            // Remove the url fragment part of the url if needed.
            // This is the part after the # and is often not useful.
            href = hrefValue.Split('#')[0];
            Uri newUri = new Uri(uriToUse, href);

            if (_cleanURLFunc != null)
                newUri = new Uri(_cleanURLFunc(newUri.AbsoluteUri));

            if (!pages.Exists(u => u.Uri.AbsoluteUri == newUri.AbsoluteUri))
            {
                PageToCrawl page = new PageToCrawl(newUri)
                {
                    ParentUri = crawledPage.Uri,
                    CrawlDepth = crawledPage.CrawlDepth + 1,
                    IsInternal = newUri.Authority == crawledPage.Uri.Authority,
                    IsRoot = false
                };
                pages.Add(page);
            }
        }
        catch (Exception e)
        {
            _logger.LogDebug("Could not parse link [{0}] on page [{1}]", hrefValue, crawledPage.Uri);
            _logger.LogError(e, e.Message);
        }
    }

    return pages;
}
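// The _cleanURLFunc used above is applied as a string => string transform on each
// absolute url before a PageToCrawl is created. A minimal sketch of such a delegate,
// assuming the caller can supply one; the "strip utm_* tracking parameters" rule is
// illustrative only.
Func<string, string> cleanUrl = absoluteUri =>
{
    var builder = new UriBuilder(absoluteUri);

    // Drop utm_* tracking parameters so the same page is not scheduled twice
    // under cosmetically different urls.
    var keptPairs = builder.Query.TrimStart('?')
        .Split(new[] { '&' }, StringSplitOptions.RemoveEmptyEntries)
        .Where(pair => !pair.StartsWith("utm_", StringComparison.OrdinalIgnoreCase));

    builder.Query = string.Join("&", keptPairs);
    return builder.Uri.AbsoluteUri;
};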