public void ShouldCrawlPage_OverMaxPagesToCrawlPerDomain_IsRetry_ReturnsTrue()
{
    Uri uri = new Uri("http://a.com/");
    CrawlConfiguration config = new CrawlConfiguration
    {
        MaxPagesToCrawlPerDomain = 100
    };
    ConcurrentDictionary<string, int> countByDomain = new ConcurrentDictionary<string, int>();
    countByDomain.TryAdd(uri.Authority, 100);
    CrawlContext crawlContext = new CrawlContext
    {
        CrawlConfiguration = config,
        CrawlStartDate = DateTime.Now,
        CrawlCountByDomain = countByDomain
    };

    CrawlDecision result = _unitUnderTest.ShouldCrawlPage(
        new PageToCrawl(new Uri(uri.AbsoluteUri + "anotherpage")) { IsRetry = true, IsInternal = true },
        crawlContext);

    Assert.IsTrue(result.Allow);
    Assert.IsFalse(result.ShouldHardStopCrawl);
    Assert.IsFalse(result.ShouldStopCrawl);
}
public CrawlArgs(CrawlContext crawlContext) { if (crawlContext == null) throw new ArgumentNullException("crawlContext"); CrawlContext = crawlContext; }
/// <summary>
/// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor.
/// </summary>
/// <param name="threadManager">Distributes http requests over multiple threads</param>
/// <param name="scheduler">Decides what link should be crawled next</param>
/// <param name="pageRequester">Makes the raw http requests</param>
/// <param name="hyperLinkParser">Parses a crawled page for its hyperlinks</param>
/// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
/// <param name="crawlConfiguration">Configurable crawl values</param>
/// <param name="memoryManager">Checks the memory usage of the host process</param>
public WebCrawler(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester pageRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager)
{
    _crawlContext = new CrawlContext();
    _crawlContext.CrawlConfiguration = crawlConfiguration ?? GetCrawlConfigurationFromConfigFile();
    CrawlBag = _crawlContext.CrawlBag;

    _threadManager = threadManager ?? new TaskThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads > 0 ? _crawlContext.CrawlConfiguration.MaxConcurrentThreads : Environment.ProcessorCount);
    _scheduler = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, null, null);
    _pageRequester = pageRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
    _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

    if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0 || _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
    {
        _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));
    }

    _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser(_crawlContext.CrawlConfiguration.IsRespectMetaRobotsNoFollowEnabled, _crawlContext.CrawlConfiguration.IsRespectAnchorRelNoFollowEnabled);

    _crawlContext.Scheduler = _scheduler;
}
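// Usage sketch (not part of the original source): constructing the crawler above with a custom
// CrawlConfiguration and letting every null parameter fall back to its default component, exactly
// as the constructor comment describes. The seed URI and config values are arbitrary examples,
// and Crawl(Uri) is assumed to be the usual Abot entry point for a blocking crawl.
CrawlConfiguration customConfig = new CrawlConfiguration
{
    MaxConcurrentThreads = 4,
    MaxPagesToCrawl = 500,
    MaxCrawlDepth = 3,
    UserAgentString = "abot example agent"
};

// Nulls mean "use the default implementation" for each collaborator.
WebCrawler crawler = new WebCrawler(customConfig, null, null, null, null, null, null);

CrawlResult crawlResult = crawler.Crawl(new Uri("http://a.com/"));   // hypothetical seed URI
Console.WriteLine(crawlResult.CrawlContext.CrawledCount);            // pages crawled in this run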
public void Consume_ValidDomain_CrawlerCrawlBagSet()
{
    //Arrange
    Domain domain = new Domain { DomainId = 1, Uri = new Uri("http://a.com") };
    CrawlContext context = GetCrawlContext(_dummyCrawlProcessors);
    CrawlResult fakeResult = new CrawlResult { CrawlContext = context };
    _fakeWebCrawlerFactory.Setup(f => f.CreateInstance()).Returns(_fakeWebCrawler.Object);
    _fakeWebCrawler.Setup(f => f.Crawl(It.IsAny<Uri>(), It.IsAny<CancellationTokenSource>())).Returns(fakeResult);
    _fakeProcessorProvider.Setup(f => f.GetProcessors()).Returns(_dummyCrawlProcessors);

    //Act
    DomainCrawlResult result = _uut.Consume(domain, _dummyCancellationToken);

    //Assert
    _fakeProcessorProvider.Verify(f => f.GetProcessors(), Times.Exactly(1));
    _fakeWebCrawlerFactory.Verify(f => f.CreateInstance(), Times.Exactly(1));
    _fakeWebCrawler.Verify(f => f.Crawl(It.IsAny<Uri>(), It.IsAny<CancellationTokenSource>()), Times.Exactly(1));
    Assert.AreEqual(domain, _fakeWebCrawler.Object.CrawlBag.GoDaddyProcessorContext.Domain);
    Assert.AreEqual(_dummyProcessorContext.PrimaryPersistenceProvider, _fakeWebCrawler.Object.CrawlBag.GoDaddyProcessorContext.PrimaryPersistenceProvider);
    Assert.AreEqual(_dummyProcessorContext.BackupPersistenceProvider, _fakeWebCrawler.Object.CrawlBag.GoDaddyProcessorContext.BackupPersistenceProvider);
    Assert.AreEqual(_dummyCrawlProcessors, _fakeWebCrawler.Object.CrawlBag.GoDaddyProcessorContext.CrawlProcessors);
}
private static Crawler CreateCrawler(CrawlContext context)
{
    var crawler = new Crawler(context.Sequence);
    crawler.PauseInterval = context.Pause;

    if (context.QueryType == QueryType.Name)
    {
        ExtraSetupForNameBasedCrawler(crawler, context);
    }

    crawler.AfterCrawl += (sender, e) =>
    {
        var progress = e.Progress;
        var message = string.Format(GetMessageFormat(context.MaximumTry), progress.Current, progress.Total, context.CancelRate, progress.CurrentKeyword, progress.Message);
        if (e.Error == null)
        {
            PowerConsole.Info(message);
        }
        else
        {
            PowerConsole.Error(message);
        }
    };

    return crawler;
}
/// <summary>
/// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor.
/// </summary>
/// <param name="threadManager">Distributes http requests over multiple threads</param>
/// <param name="scheduler">Decides what link should be crawled next</param>
/// <param name="httpRequester">Makes the raw http requests</param>
/// <param name="hyperLinkParser">Parses a crawled page for its hyperlinks</param>
/// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
/// <param name="crawlConfiguration">Configurable crawl values</param>
/// <param name="memoryManager">Checks the memory usage of the host process</param>
public WebCrawler(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester httpRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager)
{
    _crawlContext = new CrawlContext();
    _crawlContext.CrawlConfiguration = crawlConfiguration ?? GetCrawlConfigurationFromConfigFile() ?? new CrawlConfiguration();
    CrawlBag = _crawlContext.CrawlBag;

    _threadManager = threadManager ?? new ManualThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads);
    _scheduler = scheduler ?? new FifoScheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled);
    _httpRequester = httpRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
    _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

    if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0 || _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
    {
        _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));
    }

    _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser();

    _crawlContext.Scheduler = _scheduler;
}
protected override ProcessorResult ProcessPage(CrawlContext crawlContext, CrawledPage crawledPage)
{
    ProcessorResult result = new ProcessorResult { UniqueAttributeId = 222 };

    // Cheapest check first: a regex match against the raw html.
    Match regexResult = wordPressPattern.Match(crawledPage.RawContent);
    if (regexResult.Success)
    {
        result.Attributes.Add("siteBuilder", "BlogWordPress");
        result.IsAHit = true;
        return result;
    }

    // Fall back to scanning every anchor href for a wordpress.org link.
    HtmlNodeCollection listhref = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//a[@href]") ?? new HtmlNodeCollection(null);
    if (listhref.Select(node => node.GetAttributeValue("href", "")).Any(content => content.Contains("wordpress.org")))
    {
        result.Attributes.Add("siteBuilder", "BlogWordPress");
        result.IsAHit = true;
        return result;
    }

    return result;
}
protected override ProcessorResult ProcessPage(CrawlContext crawlContext, CrawledPage crawledPage)
{
    nodeQueryList.Add(new KeyValuePair<string, string>("login", "//a[contains(@href, 'login')]"));
    nodeQueryList.Add(new KeyValuePair<string, string>("signin", "//a[contains(@href, 'signin')]"));

    ProcessorResult result = new ProcessorResult { UniqueAttributeId = 17 };

    //<input type="password"
    var pwdInputs = crawledPage.CsQueryDocument.Select("input[type='password']");
    if (pwdInputs.Length > 0)
    {
        result.IsAHit = true;
    }

    //check links
    if (!result.IsAHit)
    {
        result.IsAHit = FindTags(crawledPage, crawlContext.RootUri.DnsSafeHost.ToLower());
    }

    //if we found it, set it
    if (result.IsAHit)
    {
        result.Attributes.Add(result.UniqueAttributeId.ToString(), "true");
    }

    return result;
}
public void ShouldCrawlPage_OverMaxPagesToCrawlPerDomain_ReturnsFalse()
{
    Uri uri = new Uri("http://a.com/");
    CrawlConfiguration config = new CrawlConfiguration
    {
        MaxPagesToCrawlPerDomain = 100
    };
    ConcurrentDictionary<string, int> countByDomain = new ConcurrentDictionary<string, int>();
    countByDomain.TryAdd(uri.Authority, 100);
    CrawlContext crawlContext = new CrawlContext
    {
        CrawlConfiguration = config,
        CrawlStartDate = DateTime.Now,
        CrawlCountByDomain = countByDomain
    };

    CrawlDecision result = _unitUnderTest.ShouldCrawlPage(
        new PageToCrawl(new Uri(uri.AbsoluteUri + "anotherpage")) { IsInternal = true },
        crawlContext);

    Assert.IsFalse(result.Allow);
    Assert.AreEqual("MaxPagesToCrawlPerDomain limit of [100] has been reached for domain [a.com]", result.Reason);
    Assert.IsFalse(crawlContext.IsCrawlStopRequested);
}
/// <summary>
/// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor.
/// </summary>
/// <param name="threadManager">Distributes http requests over multiple threads</param>
/// <param name="scheduler">Decides what link should be crawled next</param>
/// <param name="pageRequester">Makes the raw http requests</param>
/// <param name="htmlParser">Parses a crawled page for its hyperlinks</param>
/// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
/// <param name="crawlConfiguration">Configurable crawl values</param>
/// <param name="memoryManager">Checks the memory usage of the host process</param>
public WebCrawler(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester pageRequester,
    IHtmlParser htmlParser,
    IMemoryManager memoryManager)
{
    _crawlContext = new CrawlContext
    {
        CrawlConfiguration = crawlConfiguration ?? new CrawlConfiguration()
    };
    CrawlBag = _crawlContext.CrawlBag;

    _threadManager = threadManager ?? new TaskThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads > 0 ? _crawlContext.CrawlConfiguration.MaxConcurrentThreads : Environment.ProcessorCount);
    _scheduler = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, null, null);
    _pageRequester = pageRequester ?? new PageRequester(_crawlContext.CrawlConfiguration, new WebContentExtractor());
    _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

    if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0 || _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
    {
        _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));
    }

    _htmlParser = htmlParser ?? new AngleSharpHyperlinkParser(_crawlContext.CrawlConfiguration, null);

    _crawlContext.Scheduler = _scheduler;
}
public void ProcessCrawledDomain(CrawlContext crawlContext)
{
    string webhost = string.Empty;
    try
    {
        webhost = Dig.Instance.GetWebHostName(crawlContext.RootUri.DnsSafeHost);
    }
    catch (Exception e)
    {
        // ErrorFormat only renders the format arguments, so log the exception separately
        // (same pattern used by the other processors in this section).
        _logger.ErrorFormat("Exception occurred getting webhost name for [{0}]", crawlContext.RootUri.DnsSafeHost);
        _logger.Error(e);
    }

    ProcessorResult result = new ProcessorResult { UniqueAttributeId = ATTRIB_TYPE_ID };

    //mask
    result.IsAHit = webhost != "None";
    result.Attributes.Add(ATTRIB_TYPE_ID.ToString(), webhost);

    if (result.IsAHit)
    {
        DomainSave(crawlContext, result);
    }
}
public virtual CrawlDecision ShouldRecrawlPage(CrawledPage crawledPage, CrawlContext crawlContext)
{
    if (crawledPage == null)
    {
        return CrawlDecision.DisallowCrawl("Null crawled page");
    }

    if (crawlContext == null)
    {
        return CrawlDecision.DisallowCrawl("Null crawl context");
    }

    if (crawledPage.Exception == null)
    {
        return CrawlDecision.DisallowCrawl("WebException did not occur");
    }

    if (crawlContext.CrawlConfiguration.MaxRetryCount < 1)
    {
        return CrawlDecision.AllowCrawl("Unlimited retries");
    }

    if (crawledPage.RetryCount >= crawlContext.CrawlConfiguration.MaxRetryCount)
    {
        return CrawlDecision.DisallowCrawl("MaxRetryCount has been reached");
    }

    return CrawlDecision.AllowCrawl();
}
public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
    if (pageToCrawl == null)
    {
        return CrawlDecision.DisallowCrawl("Null crawled page");
    }

    if (crawlContext == null)
    {
        return CrawlDecision.DisallowCrawl("Null crawl context");
    }

    if (pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
    {
        return CrawlDecision.DisallowCrawl("Crawl depth is above max");
    }

    if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
    {
        return CrawlDecision.DisallowCrawl("Scheme does not begin with http");
    }

    //TODO Do we want to ignore redirect chains (i.e., not treat them as separate page crawls)?
    if (!pageToCrawl.IsRetry &&
        crawlContext.CrawlConfiguration.MaxPagesToCrawl > 0 &&
        crawlContext.CrawledCount > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
    {
        return CrawlDecision.DisallowCrawl(string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl));
    }

    return CrawlDecision.AllowCrawl();
}
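// Minimal sketch (not from the original source) exercising the MaxCrawlDepth branch of the
// decision maker above, in the same style as the unit tests earlier in this section. The seed
// URI and depth values are arbitrary; CrawlDepth is assumed to be settable on PageToCrawl.
var decisionMaker = new CrawlDecisionMaker();
var deepContext = new CrawlContext
{
    CrawlConfiguration = new CrawlConfiguration { MaxCrawlDepth = 2 }
};
var deepPage = new PageToCrawl(new Uri("http://a.com/level1/level2/level3"))
{
    CrawlDepth = 3,
    IsInternal = true
};

CrawlDecision depthDecision = decisionMaker.ShouldCrawlPage(deepPage, deepContext);
// Expected: depthDecision.Allow == false, depthDecision.Reason == "Crawl depth is above max"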
protected void DomainSave(CrawlContext crawlContext, ProcessorResult processorResult)
{
    ProcessorContext processorContext = crawlContext.CrawlBag.GoDaddyProcessorContext;
    DataComponent dataComponent = new DataComponent
    {
        ShopperId = processorContext.Domain.ShopperId,
        AttributeId = processorResult.UniqueAttributeId,
        DomainId = processorContext.Domain.DomainId,
        Attributes = processorResult.Attributes,
        DomainUri = crawlContext.RootUri,
        FoundOnUri = null
    };

    try
    {
        processorContext.PrimaryPersistenceProvider.Save(dataComponent);
    }
    catch (Exception e)
    {
        _logger.ErrorFormat("Error while trying to save domain level data to primary IPersistenceProvider [{0}], will save to backup IPersistenceProvider [{1}]."
            , processorContext.PrimaryPersistenceProvider.ToString()
            , processorContext.BackupPersistenceProvider.ToString());
        _logger.Error(e);
        processorContext.BackupPersistenceProvider.Save(dataComponent);
    }
}
private CrawlDecision ShouldCrawlPageContent(CrawledPage page, CrawlContext context)
{
    var result = new CrawlDecision();

    if (page.Uri.ToString().Contains("product") ||
        //page.Uri.ToString().Contains("lenovo") ||
        //page.Uri.ToString().Contains("laptop") ||
        page.Uri.ToString().Contains("productVariantGroup") ||
        page.Uri.ToString().Contains("-pc"))
    {
        result.Allow = true;
        if (page.Uri.ToString().Contains("-pch"))
        {
            result.Reason = "Not a product";
            result.Allow = false;
        }
    }
    else
    {
        result.Reason = "Not a product";
        result.Allow = false;
    }

    return result;
}
public static void Main(string[] args)
{
    try
    {
        // Validate the argument count before indexing into args; in the original order the
        // check could never fire because args[0]..args[3] were read first.
        if (args.Length < 4)
        {
            throw new ArgumentException("Invalid number of arguments!");
        }

        string sequenceType = args[0];
        string start = args[1];
        long max = long.Parse(args[2]);
        int pause = int.Parse(args[3]); // Pause interval

        CrawlContext context = GetCrawlContext(sequenceType, start, max, pause);
        Crawler crawler = CreateCrawler(context);
        crawler.Crawl(max, context.QueryType);
    }
    catch (Exception e)
    {
        Console.WriteLine(e);
        Trace.TraceError(e.ToString());
    }
}
private const string WEBSITE = "http://volarenovels.com/release-that-witch/"; //include http://

private static CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
    if (pageToCrawl.Uri.AbsoluteUri.Contains("volarenovels.com/release-that-witch/"))
    {
        String segment = pageToCrawl.Uri.Segments[pageToCrawl.Uri.Segments.Length - 1];
        if (segment.Contains("rw-chapter-"))
        {
            segment = segment.Replace("/", "").Substring("rw-chapter-".Length);
            if (Convert.ToInt32(segment) > 186)
            {
                return new CrawlDecision { Allow = true, Reason = "Is a chapter" };
            }
        }
    }

    if (pageToCrawl.Uri.AbsoluteUri == "http://volarenovels.com/release-that-witch/")
    {
        return new CrawlDecision { Allow = true, Reason = "Is content" };
    }

    return new CrawlDecision { Allow = false, Reason = "Is not a chapter" };
}
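// Wiring sketch (an assumption about the caller, not shown in the original source): older Abot
// releases expose a ShouldCrawlPage hook that accepts a delegate, so the filter above can be
// plugged in roughly like this. The hook name and signature may differ between Abot versions.
private static void RunChapterCrawl()
{
    PoliteWebCrawler chapterCrawler = new PoliteWebCrawler();
    chapterCrawler.ShouldCrawlPage(ShouldCrawlPage);          // register the custom filter above

    CrawlResult chapterResult = chapterCrawler.Crawl(new Uri(WEBSITE));
    Console.WriteLine(chapterResult.CrawlContext.CrawledCount + " pages crawled");
}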
public void SetUp() { _crawlContext = new CrawlContext(); _crawlContext.CrawlConfiguration = new CrawlConfiguration { UserAgentString = "aaa" }; _unitUnderTest = new CrawlDecisionMaker(); }
public PageCrawlStartingArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl) : base(crawlContext) { if (pageToCrawl == null) throw new ArgumentNullException("pageToCrawl"); PageToCrawl = pageToCrawl; }
public void Constructor_ValidArg_SetsPublicProperty() { CrawledPage page = new CrawledPage(new Uri("http://aaa.com/")); CrawlContext context = new CrawlContext(); CrawlArgs args = new CrawlArgs(context); Assert.AreSame(context, args.CrawlContext); }
public CrawlArgs(CrawlContext crawlContext) { if (crawlContext == null) { throw new ArgumentNullException("crawlContext"); } CrawlContext = crawlContext; }
static void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
{
    string childUrl = e.PageToCrawl.Uri.AbsoluteUri;
    string parentUrl = e.PageToCrawl.ParentUri.AbsoluteUri;
    CrawlContext context = e.CrawlContext;
    CrawledLinks crawledLinks = context.CrawlBag.CrawledLinks;
    crawledLinks.AddRelation(parentUrl, childUrl);
}
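// Setup sketch (not from the original source) for the handler above: CrawlBag is a dynamic bag on
// CrawlContext/WebCrawler, so the CrawledLinks object must be seeded before the crawl starts.
// CrawledLinks is a hypothetical helper (only AddRelation(parent, child) is implied above), and
// the PageCrawlStarting event name is assumed from Abot's synchronous event naming.
public class CrawledLinks
{
    private readonly ConcurrentDictionary<string, ConcurrentBag<string>> _childrenByParent =
        new ConcurrentDictionary<string, ConcurrentBag<string>>();

    // Records that childUrl was discovered while crawling parentUrl.
    public void AddRelation(string parentUrl, string childUrl)
    {
        _childrenByParent
            .GetOrAdd(parentUrl ?? string.Empty, _ => new ConcurrentBag<string>())
            .Add(childUrl);
    }
}

static void RunLinkGraphCrawl()
{
    var crawler = new PoliteWebCrawler();
    crawler.CrawlBag.CrawledLinks = new CrawledLinks();            // seed the bag the handler reads
    crawler.PageCrawlStarting += crawler_ProcessPageCrawlStarting; // assumed event name
    crawler.Crawl(new Uri("http://a.com/"));                       // hypothetical seed URI
}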
public CrawlEventArgs(CrawlContext crawlContext) { if (crawlContext == null) { throw new ArgumentNullException(nameof(crawlContext)); } CrawlContext = crawlContext; }
public PageCrawlStartingArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl) : base(crawlContext) { if (pageToCrawl == null) { throw new ArgumentNullException("pageToCrawl"); } PageToCrawl = pageToCrawl; }
public PageCrawlDisallowedArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl, string disallowedReason) : base(crawlContext, pageToCrawl) { if (string.IsNullOrWhiteSpace(disallowedReason)) { throw new ArgumentNullException(nameof(disallowedReason)); } DisallowedReason = disallowedReason; }
public PageLinksCrawlDisallowedArgs(CrawlContext crawlContext, CrawledPage crawledPage, string disallowedReason) : base(crawlContext, crawledPage) { if (string.IsNullOrWhiteSpace(disallowedReason)) { throw new ArgumentNullException("disallowedReason"); } DisallowedReason = disallowedReason; }
public void SetUp() { _fakeScheduler = new Mock <IScheduler>(); _crawlContext = new CrawlContext(); _crawlContext.CrawlConfiguration = new CrawlConfiguration { UserAgentString = "aaa" }; _crawlContext.Scheduler = _fakeScheduler.Object; _unitUnderTest = new CrawlDecisionMaker(); }
public PageCrawlCompletedArgs(CrawlContext crawlContext, CrawledPage crawledPage) : base(crawlContext) { if (crawledPage == null) { throw new ArgumentNullException("crawledPage"); } CrawledPage = crawledPage; }
/// <summary>
///
/// </summary>
/// <param name="crawlContext">The crawl context</param>
/// <param name="crawlResult">The crawl result</param>
public CrawlCompletedArgs(CrawlContext crawlContext, CrawlResult crawlResult) : base(crawlContext)
{
    if (crawlResult == null)
    {
        throw new ArgumentNullException("crawlResult");
    }

    Result = crawlResult;
}
public PageCrawlEventStartingEventArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl) : base(crawlContext) { if (pageToCrawl == null) { throw new ArgumentNullException(nameof(pageToCrawl)); } PageToCrawl = pageToCrawl; }
public PageCrawlEventCompletedEventArgs(CrawlContext crawlContext, CrawledPage crawledPage) : base(crawlContext) { if (crawledPage == null) { throw new ArgumentNullException(nameof(crawledPage)); } CrawledPage = crawledPage; }
private void ProcessCrawledPage(CrawlContext crawlContext, CrawledPage crawledPage)
{
    if (!IsHttpStatusInConfig(crawledPage))
    {
        return;
    }

    if (!IsMimeTypesToProcessInConfig(crawledPage))
    {
        return;
    }

    int timeoutInMilliSecs = _config.MaxPageProcessorTimeInMilliSecs;
    IEnumerable<ICrawlProcessor> processors = crawlContext.CrawlBag.GoDaddyProcessorContext.CrawlProcessors;

    //Did not use Parallel.ForEach because it would spawn too many threads and cause heavy thrashing; most processors can take up to 30 secs to finish
    foreach (ICrawlProcessor processor in processors)
    {
        Stopwatch timer = Stopwatch.StartNew();
        try
        {
            processor.ProcessCrawledPage(crawlContext, crawledPage);
            timer.Stop();

            if (timer.ElapsedMilliseconds > timeoutInMilliSecs)
            {
                _logger.ErrorFormat(
                    "Crawled page processor [{0}] completed processing page [{1}] in [{2}] millisecs, which is above configuration value MaxPageProcessorTimeInMilliSecs",
                    processor.ToString(), crawledPage.Uri, timer.ElapsedMilliseconds);
            }
            else
            {
                _logger.DebugFormat(
                    "Crawled page processor [{0}] completed processing page [{1}] in [{2}] millisecs",
                    processor.ToString(), crawledPage.Uri, timer.ElapsedMilliseconds);
            }
        }
        catch (Exception e)
        {
            _logger.ErrorFormat(
                "Crawled page processor [{0}] threw exception while processing page [{1}]",
                processor.ToString(), crawledPage.Uri);
            _logger.Error(e);
        }
        finally
        {
            if (timer != null && timer.IsRunning)
            {
                timer.Stop();
            }
        }
    }
}
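// Minimal processor sketch (an assumption: the ICrawlProcessor shape is inferred from the calls in
// this section, i.e. ProcessCrawledPage(CrawlContext, CrawledPage) plus the ProcessCrawledDomain
// method used by the webhost processor above). It only logs what it sees.
public class LoggingCrawlProcessor : ICrawlProcessor
{
    public void ProcessCrawledPage(CrawlContext crawlContext, CrawledPage crawledPage)
    {
        // Keep per-page work well under MaxPageProcessorTimeInMilliSecs; heavy work belongs elsewhere.
        Console.WriteLine("Processed page [{0}] for root [{1}]", crawledPage.Uri, crawlContext.RootUri);
    }

    public void ProcessCrawledDomain(CrawlContext crawlContext)
    {
        Console.WriteLine("Finished domain [{0}]", crawlContext.RootUri);
    }
}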
public virtual CrawlDecision ShouldCrawlPageLinks(CrawledPage crawledPage, CrawlContext crawlContext)
{
    if (crawledPage == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawled page" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    if (string.IsNullOrWhiteSpace(crawledPage.Content.Text))
        return new CrawlDecision { Allow = false, Reason = "Page has no content" };

    if (!crawlContext.CrawlConfiguration.IsExternalPageLinksCrawlingEnabled && !crawledPage.IsInternal)
        return new CrawlDecision { Allow = false, Reason = "Link is external" };

    if (crawledPage.CrawlDepth >= crawlContext.CrawlConfiguration.MaxCrawlDepth)
        return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

    return new CrawlDecision { Allow = true };
}
public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
    if (pageToCrawl == null)
        return new CrawlDecision { Allow = false, Reason = "Null page to crawl" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    if (pageToCrawl.RedirectedFrom != null && pageToCrawl.RedirectPosition > crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects)
        return new CrawlDecision { Allow = false, Reason = string.Format("HttpRequestMaxAutoRedirects limit of [{0}] has been reached", crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects) };

    if (pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
        return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

    if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
        return new CrawlDecision { Allow = false, Reason = "Scheme does not begin with http" };

    //TODO Do we want to ignore redirect chains (i.e., not treat them as separate page crawls)?
    if (!pageToCrawl.IsRetry &&
        crawlContext.CrawlConfiguration.MaxPagesToCrawl > 0 &&
        crawlContext.CrawledCount + crawlContext.Scheduler.Count + 1 > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
    {
        return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl) };
    }

    int pagesCrawledInThisDomain = 0;
    if (!pageToCrawl.IsRetry &&
        crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain > 0 &&
        crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out pagesCrawledInThisDomain) &&
        pagesCrawledInThisDomain > 0)
    {
        if (pagesCrawledInThisDomain >= crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain)
            return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawlPerDomain limit of [{0}] has been reached for domain [{1}]", crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain, pageToCrawl.Uri.Authority) };
    }

    if (!crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled && !pageToCrawl.IsInternal)
        return new CrawlDecision { Allow = false, Reason = "Link is external" };

    return new CrawlDecision { Allow = true };
}
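// Test sketch (not in the original source) highlighting why the MaxPagesToCrawl branch above adds
// Scheduler.Count: pages already queued count against the limit. It reuses the Moq-based fixture
// style from the SetUp method earlier in this section (_fakeScheduler, _crawlContext, _unitUnderTest).
public void ShouldCrawlPage_ScheduledPagesFillMaxPagesToCrawl_ReturnsFalse()
{
    _fakeScheduler.Setup(s => s.Count).Returns(100);
    _crawlContext.CrawlConfiguration.MaxPagesToCrawl = 100;

    CrawlDecision result = _unitUnderTest.ShouldCrawlPage(
        new PageToCrawl(new Uri("http://a.com/b")) { IsInternal = true },
        _crawlContext);

    // Disallowed because CrawledCount (0) + Scheduler.Count (100) + 1 exceeds the limit of 100.
    Assert.IsFalse(result.Allow);
}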
/// <summary>
/// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor.
/// </summary>
/// <param name="threadManager">Distributes http requests over multiple threads</param>
/// <param name="scheduler">Decides what link should be crawled next</param>
/// <param name="pageRequester">Makes the raw http requests</param>
/// <param name="hyperLinkParser">Parses a crawled page for its hyperlinks</param>
/// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
/// <param name="crawlConfiguration">Configurable crawl values</param>
/// <param name="memoryManager">Checks the memory usage of the host process</param>
public WebCrawler(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester pageRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager)
{
    _crawlContext = new CrawlContext();
    _crawlContext.CrawlConfiguration = crawlConfiguration ?? new CrawlConfiguration();
    CrawlBag = _crawlContext.CrawlBag;

    _threadManager = threadManager ?? new TaskThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads > 0 ? _crawlContext.CrawlConfiguration.MaxConcurrentThreads : Environment.ProcessorCount);
    _scheduler = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, null, null);
    _pageRequester = pageRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
    _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

    if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0 || _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
        _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));

    _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser(_crawlContext.CrawlConfiguration, null);

    _crawlContext.Scheduler = _scheduler;
}
public virtual CrawlDecision ShouldDownloadPageContent(CrawledPage crawledPage, CrawlContext crawlContext)
{
    if (crawledPage == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawled page" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    if (crawledPage.HttpWebResponse == null)
        return new CrawlDecision { Allow = false, Reason = "Null HttpWebResponse" };

    if (crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
        return new CrawlDecision { Allow = false, Reason = "HttpStatusCode is not 200" };

    string pageContentType = crawledPage.HttpWebResponse.ContentType.ToLower().Trim();
    bool isDownloadable = false;
    List<string> cleanDownloadableContentTypes = crawlContext.CrawlConfiguration.DownloadableContentTypes
        .Split(',')
        .Select(t => t.Trim())
        .Where(t => !string.IsNullOrEmpty(t))
        .ToList();

    foreach (string downloadableContentType in cleanDownloadableContentTypes)
    {
        if (pageContentType.Contains(downloadableContentType.ToLower().Trim()))
        {
            isDownloadable = true;
            break;
        }
    }

    if (!isDownloadable)
        return new CrawlDecision { Allow = false, Reason = "Content type is not any of the following: " + string.Join(",", cleanDownloadableContentTypes) };

    if (crawlContext.CrawlConfiguration.MaxPageSizeInBytes > 0 && crawledPage.HttpWebResponse.ContentLength > crawlContext.CrawlConfiguration.MaxPageSizeInBytes)
        return new CrawlDecision { Allow = false, Reason = string.Format("Page size of [{0}] bytes is above the max allowable of [{1}] bytes", crawledPage.HttpWebResponse.ContentLength, crawlContext.CrawlConfiguration.MaxPageSizeInBytes) };

    return new CrawlDecision { Allow = true };
}
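// Configuration sketch (not from the original source) showing the two settings ShouldDownloadPageContent
// reads: DownloadableContentTypes is a comma-separated whitelist that is split and trimmed, and a
// MaxPageSizeInBytes of 0 disables the size check. The values here are arbitrary examples.
CrawlConfiguration downloadConfig = new CrawlConfiguration
{
    DownloadableContentTypes = "text/html, text/plain",  // only these content types get downloaded
    MaxPageSizeInBytes = 1024 * 1024                      // reject responses larger than 1 MB
};

CrawlContext downloadContext = new CrawlContext { CrawlConfiguration = downloadConfig };
// Pass downloadContext to ShouldDownloadPageContent together with a CrawledPage whose
// HttpWebResponse has already been populated by the page requester.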
public virtual CrawlDecision ShouldRecrawlPage(CrawledPage crawledPage, CrawlContext crawlContext)
{
    if (crawledPage == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawled page" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    if (crawledPage.WebException == null)
        return new CrawlDecision { Allow = false, Reason = "WebException did not occur" };

    if (crawlContext.CrawlConfiguration.MaxRetryCount < 1)
        return new CrawlDecision { Allow = false, Reason = "MaxRetryCount is less than 1" };

    if (crawledPage.RetryCount >= crawlContext.CrawlConfiguration.MaxRetryCount)
        return new CrawlDecision { Allow = false, Reason = "MaxRetryCount has been reached" };

    return new CrawlDecision { Allow = true };
}