public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
    if (pageToCrawl == null)
        return new CrawlDecision { Allow = false, Reason = "Null page to crawl" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    if (pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
        return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

    if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
        return new CrawlDecision { Allow = false, Reason = "Scheme does not begin with http" };

    if (crawlContext.CrawledCount + 1 > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
    {
        return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl) };
    }

    int pagesCrawledInThisDomain = 0;
    if (crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain > 0 &&
        crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out pagesCrawledInThisDomain) &&
        pagesCrawledInThisDomain > 0)
    {
        if (pagesCrawledInThisDomain >= crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain)
            return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawlPerDomain limit of [{0}] has been reached for domain [{1}]", crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain, pageToCrawl.Uri.Authority) };
    }

    if (!crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled && !pageToCrawl.IsInternal)
        return new CrawlDecision { Allow = false, Reason = "Link is external" };

    return new CrawlDecision { Allow = true };
}
public CrawlArgs(CrawlContext crawlContext)
{
    if (crawlContext == null)
        throw new ArgumentNullException("crawlContext");

    CrawlContext = crawlContext;
}
public PageCrawlDisallowedArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl, string disallowedReason)
    : base(crawlContext, pageToCrawl)
{
    if (string.IsNullOrWhiteSpace(disallowedReason))
        throw new ArgumentNullException("disallowedReason");

    DisallowedReason = disallowedReason;
}
public PageCrawlStartingArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl)
    : base(crawlContext)
{
    if (pageToCrawl == null)
        throw new ArgumentNullException("pageToCrawl");

    PageToCrawl = pageToCrawl;
}
public PageLinksCrawlDisallowedArgs(CrawlContext crawlContext, CrawledPage crawledPage, string disallowedReason)
    : base(crawlContext, crawledPage)
{
    if (string.IsNullOrWhiteSpace(disallowedReason))
        throw new ArgumentNullException("disallowedReason");

    DisallowedReason = disallowedReason;
}
public PageCrawlCompletedArgs(CrawlContext crawlContext, CrawledPage crawledPage)
    : base(crawlContext)
{
    if (crawledPage == null)
        throw new ArgumentNullException("crawledPage");

    CrawledPage = crawledPage;
}
public void Constructor_ValidArg_SetsPublicProperty()
{
    CrawledPage page = new CrawledPage(new Uri("http://aaa.com/"));
    CrawlContext context = new CrawlContext();

    CrawlArgs args = new CrawlArgs(context);

    Assert.AreSame(context, args.CrawlContext);
}
public void SetUp()
{
    _fakeScheduler = new Mock<IScheduler>();

    _crawlContext = new CrawlContext();
    _crawlContext.CrawlConfiguration = new CrawlConfiguration { UserAgentString = "aaa" };
    _crawlContext.Scheduler = _fakeScheduler.Object;

    _unitUnderTest = new CrawlDecisionMaker();
}
public void Constructor_ValidUri_CreatesInstance()
{
    CrawlContext unitUnderTest = new CrawlContext();

    Assert.AreEqual(null, unitUnderTest.RootUri);
    Assert.IsNotNull(unitUnderTest.CrawledUrls);
    Assert.AreEqual(0, unitUnderTest.CrawledUrls.Count);
    Assert.IsNotNull(unitUnderTest.CrawlCountByDomain);
    Assert.AreEqual(0, unitUnderTest.CrawlCountByDomain.Count);
    Assert.IsNull(unitUnderTest.CrawlConfiguration);
    Assert.IsNotNull(unitUnderTest.CrawlBag);
    Assert.AreEqual(false, unitUnderTest.IsCrawlStopRequested);
}
public void CrawlBag()
{
    CrawlContext unitUnderTest = new CrawlContext();
    unitUnderTest.CrawlBag.SomeVal = "someval";
    unitUnderTest.CrawlBag.SomeQueue = new Queue<string>();
    unitUnderTest.CrawlBag.SomeQueue.Enqueue("aaa");
    unitUnderTest.CrawlBag.SomeQueue.Enqueue("bbb");

    Assert.IsNotNull(unitUnderTest.CrawlBag);
    Assert.AreEqual("someval", unitUnderTest.CrawlBag.SomeVal);
    Assert.AreEqual("aaa", unitUnderTest.CrawlBag.SomeQueue.Dequeue());
    Assert.AreEqual("bbb", unitUnderTest.CrawlBag.SomeQueue.Dequeue());
}
public virtual CrawlDecision ShouldCrawlPageLinks(CrawledPage crawledPage, CrawlContext crawlContext)
{
    if (crawledPage == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawled page" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    if (string.IsNullOrWhiteSpace(crawledPage.Content.Text))
        return new CrawlDecision { Allow = false, Reason = "Page has no content" };

    if (!crawlContext.CrawlConfiguration.IsExternalPageLinksCrawlingEnabled && !crawledPage.IsInternal)
        return new CrawlDecision { Allow = false, Reason = "Link is external" };

    if (crawledPage.CrawlDepth >= crawlContext.CrawlConfiguration.MaxCrawlDepth)
        return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

    return new CrawlDecision { Allow = true };
}
public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
    if (pageToCrawl == null)
        return new CrawlDecision { Allow = false, Reason = "Null page to crawl" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    if (pageToCrawl.RedirectedFrom != null && pageToCrawl.RedirectPosition > crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects)
        return new CrawlDecision { Allow = false, Reason = string.Format("HttpRequestMaxAutoRedirects limit of [{0}] has been reached", crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects) };

    if (pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
        return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

    if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
        return new CrawlDecision { Allow = false, Reason = "Scheme does not begin with http" };

    //TODO Do we want to ignore redirect chains (i.e., not treat them as separate page crawls)?
    if (!pageToCrawl.IsRetry &&
        crawlContext.CrawlConfiguration.MaxPagesToCrawl > 0 &&
        crawlContext.CrawledCount + crawlContext.Scheduler.Count + 1 > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
    {
        return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl) };
    }

    int pagesCrawledInThisDomain = 0;
    if (!pageToCrawl.IsRetry &&
        crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain > 0 &&
        crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out pagesCrawledInThisDomain) &&
        pagesCrawledInThisDomain > 0)
    {
        if (pagesCrawledInThisDomain >= crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain)
            return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawlPerDomain limit of [{0}] has been reached for domain [{1}]", crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain, pageToCrawl.Uri.Authority) };
    }

    if (!crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled && !pageToCrawl.IsInternal)
        return new CrawlDecision { Allow = false, Reason = "Link is external" };

    return new CrawlDecision { Allow = true };
}
/// <summary>
/// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor.
/// </summary>
/// <param name="crawlConfiguration">Configurable crawl values</param>
/// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
/// <param name="threadManager">Distributes http requests over multiple threads</param>
/// <param name="scheduler">Decides what link should be crawled next</param>
/// <param name="pageRequester">Makes the raw http requests</param>
/// <param name="hyperLinkParser">Parses a crawled page for its hyperlinks</param>
/// <param name="memoryManager">Checks the memory usage of the host process</param>
public WebCrawler(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester pageRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager)
{
    _crawlContext = new CrawlContext();
    _crawlContext.CrawlConfiguration = crawlConfiguration ?? GetCrawlConfigurationFromConfigFile();
    CrawlBag = _crawlContext.CrawlBag;

    _threadManager = threadManager ?? new TaskThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads > 0 ? _crawlContext.CrawlConfiguration.MaxConcurrentThreads : Environment.ProcessorCount);
    _scheduler = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, null, null);
    _pageRequester = pageRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
    _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

    if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0 || _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
        _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));

    _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser(_crawlContext.CrawlConfiguration.IsRespectMetaRobotsNoFollowEnabled, _crawlContext.CrawlConfiguration.IsRespectAnchorRelNoFollowEnabled, null, _crawlContext.CrawlConfiguration.IsRespectUrlNamedAnchorOrHashbangEnabled);

    _crawlContext.Scheduler = _scheduler;
}
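// Usage sketch (not part of the library source): passing null for every pluggable
// dependency falls back to the defaults wired up in the constructor above. The seed
// URL and config value below are illustrative assumptions, and the synchronous
// Crawl(Uri) entry point is assumed from the older Abot-style API this code targets.
public static void RunDefaultCrawl_Sketch()
{
    var config = new CrawlConfiguration { MaxPagesToCrawl = 10 };             // assumed sample value
    var crawler = new WebCrawler(config, null, null, null, null, null, null); // all defaults
    crawler.Crawl(new Uri("http://example.com/"));                            // assumed entry point
}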
public void SetUp()
{
    _crawlContext = new CrawlContext();
    _crawlContext.CrawlConfiguration = new CrawlConfiguration { UserAgentString = "aaa" };

    _unitUnderTest = new CrawlDecisionMaker();
}
public virtual CrawlDecision ShouldDownloadPageContent(CrawledPage crawledPage, CrawlContext crawlContext)
{
    if (crawledPage == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawled page" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    if (crawledPage.HttpWebResponse == null)
        return new CrawlDecision { Allow = false, Reason = "Null HttpWebResponse" };

    if (crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
        return new CrawlDecision { Allow = false, Reason = "HttpStatusCode is not 200" };

    string pageContentType = crawledPage.HttpWebResponse.ContentType.ToLower().Trim();
    bool isDownloadable = false;
    List<string> cleanDownloadableContentTypes = crawlContext.CrawlConfiguration.DownloadableContentTypes
        .Split(',')
        .Select(t => t.Trim())
        .Where(t => !string.IsNullOrEmpty(t))
        .ToList();

    foreach (string downloadableContentType in cleanDownloadableContentTypes)
    {
        if (pageContentType.Contains(downloadableContentType.ToLower().Trim()))
        {
            isDownloadable = true;
            break;
        }
    }
    if (!isDownloadable)
        return new CrawlDecision { Allow = false, Reason = "Content type is not any of the following: " + string.Join(",", cleanDownloadableContentTypes) };

    if (crawlContext.CrawlConfiguration.MaxPageSizeInBytes > 0 && crawledPage.HttpWebResponse.ContentLength > crawlContext.CrawlConfiguration.MaxPageSizeInBytes)
        return new CrawlDecision { Allow = false, Reason = string.Format("Page size of [{0}] bytes is above the max allowable of [{1}] bytes", crawledPage.HttpWebResponse.ContentLength, crawlContext.CrawlConfiguration.MaxPageSizeInBytes) };

    return new CrawlDecision { Allow = true };
}
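// Illustrative sketch (not library code): the content-type check above splits the
// configured DownloadableContentTypes string on commas and does a case-insensitive
// substring match against the response's Content-Type header. The sample values below
// are assumptions chosen to show a match; requires System.Linq and System.Collections.Generic.
public static bool IsDownloadableContentType_Sketch()
{
    string configuredTypes = "text/html, text/plain";      // assumed config value
    string pageContentType = "text/html; charset=utf-8";   // assumed response header

    List<string> cleanTypes = configuredTypes
        .Split(',')
        .Select(t => t.Trim())
        .Where(t => !string.IsNullOrEmpty(t))
        .ToList();

    // True here because "text/html; charset=utf-8" contains "text/html"
    return cleanTypes.Any(t => pageContentType.ToLower().Trim().Contains(t.ToLower()));
}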
public virtual CrawlDecision ShouldRecrawlPage(CrawledPage crawledPage, CrawlContext crawlContext)
{
    if (crawledPage == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawled page" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    if (crawledPage.WebException == null)
        return new CrawlDecision { Allow = false, Reason = "WebException did not occur" };

    if (crawlContext.CrawlConfiguration.MaxRetryCount < 1)
        return new CrawlDecision { Allow = false, Reason = "MaxRetryCount is less than 1" };

    if (crawledPage.RetryCount >= crawlContext.CrawlConfiguration.MaxRetryCount)
        return new CrawlDecision { Allow = false, Reason = "MaxRetryCount has been reached" };

    return new CrawlDecision { Allow = true };
}
public void ShouldCrawlPage_OverMaxPagesToCrawlPerDomain_IsRetry_ReturnsTrue()
{
    Uri uri = new Uri("http://a.com/");
    CrawlConfiguration config = new CrawlConfiguration
    {
        MaxPagesToCrawlPerDomain = 100
    };
    ConcurrentDictionary<string, int> countByDomain = new ConcurrentDictionary<string, int>();
    countByDomain.TryAdd(uri.Authority, 100);
    CrawlContext crawlContext = new CrawlContext
    {
        CrawlConfiguration = config,
        CrawlStartDate = DateTime.Now,
        CrawlCountByDomain = countByDomain
    };

    CrawlDecision result = _unitUnderTest.ShouldCrawlPage(
        new PageToCrawl(new Uri(uri.AbsoluteUri + "anotherpage"))
        {
            IsRetry = true,
            IsInternal = true
        },
        crawlContext);

    Assert.IsTrue(result.Allow);
    Assert.IsFalse(result.ShouldHardStopCrawl);
    Assert.IsFalse(result.ShouldStopCrawl);
}
public void ShouldCrawlPage_NonDuplicate_ReturnsTrue()
{
    CrawlContext crawlContext = new CrawlContext
    {
        CrawlConfiguration = new CrawlConfiguration(),
        CrawlStartDate = DateTime.Now
    };

    CrawlDecision result = _unitUnderTest.ShouldCrawlPage(
        new PageToCrawl(new Uri("http://a.com/")) { IsInternal = true },
        crawlContext);

    Assert.IsTrue(result.Allow);
    Assert.AreEqual("", result.Reason);
    Assert.IsFalse(result.ShouldHardStopCrawl);
    Assert.IsFalse(result.ShouldStopCrawl);
}
public void ShouldCrawlPage_OverMaxPagesToCrawlPerDomain_ReturnsFalse()
{
    Uri uri = new Uri("http://a.com/");
    CrawlConfiguration config = new CrawlConfiguration
    {
        MaxPagesToCrawlPerDomain = 100
    };
    ConcurrentDictionary<string, int> countByDomain = new ConcurrentDictionary<string, int>();
    countByDomain.TryAdd(uri.Authority, 100);
    CrawlContext crawlContext = new CrawlContext
    {
        CrawlConfiguration = config,
        CrawlStartDate = DateTime.Now,
        CrawlCountByDomain = countByDomain
    };

    CrawlDecision result = _unitUnderTest.ShouldCrawlPage(
        new PageToCrawl(new Uri(uri.AbsoluteUri + "anotherpage")) { IsInternal = true },
        crawlContext);

    Assert.IsFalse(result.Allow);
    Assert.AreEqual("MaxPagesToCrawlPerDomain limit of [100] has been reached for domain [a.com]", result.Reason);
    Assert.IsFalse(crawlContext.IsCrawlStopRequested);
}
public void ShouldCrawlPage_OverMaxCrawlDepth_ReturnsFalse()
{
    CrawlContext crawlContext = new CrawlContext
    {
        CrawlConfiguration = new CrawlConfiguration
        {
            MaxCrawlDepth = 2
        }
    };

    CrawlDecision result = _unitUnderTest.ShouldCrawlPage(
        new PageToCrawl(new Uri("http://a.com/"))
        {
            IsInternal = true,
            CrawlDepth = 3
        },
        crawlContext);

    Assert.IsFalse(result.Allow);
    Assert.AreEqual("Crawl depth is above max", result.Reason);
    Assert.IsFalse(result.ShouldHardStopCrawl);
    Assert.IsFalse(result.ShouldStopCrawl);
}
//Delegates for the Abot WebCrawler instance

/// <summary>
/// Delegate passed to WebCrawler to determine if any parsed links should be
/// scheduled or not. Always returns false to override the default Abot.WebCrawler.cs
/// behavior (see remarks), allowing total control of scheduled links external to Abot.
/// </summary>
/// <returns>false</returns>
/// <remarks>This will always return false because it prevents the Abot crawler from adding links
/// itself; instead, links are added when each crawled page is processed in the
/// crawler_ProcessPageCrawlCompleted handler. See line ~795 in Abot.Crawler.WebCrawler.cs.
/// </remarks>
public bool ShouldScheduleLink(Uri uri, CrawledPage page, CrawlContext context)
{
    return false;
}
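// Wiring sketch (assumed, not taken from this project's source): registers the delegate
// above with a WebCrawler instance. The fluent ShouldScheduleLink(...) registration method
// and the PageCrawlCompleted event are assumed to exist as in Abot 1.x, and
// crawler_ProcessPageCrawlCompleted is the handler referenced in the remarks above.
public void AttachSchedulingOverrides_Sketch(WebCrawler crawler)
{
    crawler.ShouldScheduleLink(ShouldScheduleLink);                   // always returns false (above)
    crawler.PageCrawlCompleted += crawler_ProcessPageCrawlCompleted;  // links are scheduled here instead
}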
public void ShouldCrawlPage_IsExternalPageCrawlingEnabledFalse_PageIsExternal_ReturnsFalse()
{
    CrawlContext crawlContext = new CrawlContext
    {
        CrawlStartDate = DateTime.Now.AddSeconds(-100),
        CrawlConfiguration = new CrawlConfiguration
        {
            CrawlTimeoutSeconds = 0 //equivalent to infinity
        }
    };

    CrawlDecision result = _unitUnderTest.ShouldCrawlPage(
        new PageToCrawl(new Uri("http://a.com/")) { IsInternal = false },
        crawlContext);

    Assert.IsFalse(result.Allow);
    Assert.AreEqual("Link is external", result.Reason);
    Assert.IsFalse(result.ShouldHardStopCrawl);
    Assert.IsFalse(result.ShouldStopCrawl);
}
public void ShouldCrawlPage_OverMaxPageToCrawlLimit_ReturnsFalse()
{
    CrawlContext crawlContext = new CrawlContext
    {
        CrawlConfiguration = new CrawlConfiguration
        {
            MaxPagesToCrawl = 0
        }
    };

    CrawlDecision result = _unitUnderTest.ShouldCrawlPage(new PageToCrawl(new Uri("http://a.com/")), crawlContext);

    Assert.IsFalse(result.Allow);
    Assert.AreEqual("MaxPagesToCrawl limit of [0] has been reached", result.Reason);
    Assert.IsFalse(result.ShouldHardStopCrawl);
    Assert.IsFalse(result.ShouldStopCrawl);
}
/// <summary>
/// Delegate passed to WebCrawler which calls functionality to check blacklisted urls
/// when deciding whether a page should be crawled or not.
/// If the <paramref name="pageToCrawl"/> domain is blacklisted, then
/// CrawlDecision.Allow is set to false. This delegate is called after the default
/// CrawlDecisionMaker.ShouldCrawlPage() method is called.
/// </summary>
/// <returns>CrawlDecision</returns>
/// <remarks>The default CrawlDecisionMaker.ShouldCrawlPage() method is called first, but then
/// this method will be called. There is no reason to override the default CrawlDecisionMaker; see
/// about line ~721 in WebCrawler.cs.</remarks>
public CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
    CrawlDecision decision = null;
    var domain = pageToCrawl.Uri.GetBaseDomain();
    if (_repo.IsBlackListed(domain))
    {
        decision = new CrawlDecision
        {
            Allow = false,
            Reason = string.Format("The domain {0} is blacklisted", domain)
        };
    }
    else
    {
        decision = new CrawlDecision() { Allow = true };
    }
    return decision;
}
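// Wiring sketch (assumed, not taken from this project's source): registers the blacklist
// check above so it runs after the built-in CrawlDecisionMaker.ShouldCrawlPage(), per the
// remarks. The fluent ShouldCrawlPage(...) registration method and the PageCrawlDisallowed
// event are assumed to exist as in Abot 1.x.
public void AttachBlacklistDecision_Sketch(WebCrawler crawler)
{
    crawler.ShouldCrawlPage(ShouldCrawlPage);

    // Pages rejected here surface through PageCrawlDisallowed with the Reason set above.
    crawler.PageCrawlDisallowed += (sender, e) =>
        Console.WriteLine("Skipped {0}: {1}", e.PageToCrawl.Uri, e.DisallowedReason);
}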
public void ShouldCrawlPage_ZeroMaxPageToCrawlLimit_ReturnsTrue()
{
    CrawlContext crawlContext = new CrawlContext
    {
        CrawlConfiguration = new CrawlConfiguration
        {
            MaxPagesToCrawl = 0
        },
        CrawledCount = 100
    };

    CrawlDecision result = _unitUnderTest.ShouldCrawlPage(new PageToCrawl(new Uri("http://a.com/")) { IsInternal = true }, crawlContext);

    Assert.IsTrue(result.Allow);
}
public void ShouldCrawlPage_EqualToMaxCrawlDepth_ReturnsTrue()
{
    CrawlContext crawlContext = new CrawlContext
    {
        CrawlConfiguration = new CrawlConfiguration
        {
            MaxCrawlDepth = 2
        }
    };

    CrawlDecision result = _unitUnderTest.ShouldCrawlPage(
        new PageToCrawl(new Uri("http://a.com/"))
        {
            IsInternal = true,
            CrawlDepth = 2
        },
        crawlContext);

    Assert.IsTrue(result.Allow);
}
public void ShouldCrawlPage_IsExternalPageCrawlingEnabledTrue_PageIsExternal_ReturnsTrue()
{
    CrawlContext crawlContext = new CrawlContext
    {
        CrawlConfiguration = new CrawlConfiguration
        {
            IsExternalPageCrawlingEnabled = true
        },
        CrawlStartDate = DateTime.Now
    };

    CrawlDecision result = _unitUnderTest.ShouldCrawlPage(
        new PageToCrawl(new Uri("http://a.com/")) { IsInternal = false },
        crawlContext);

    Assert.IsTrue(result.Allow);
    Assert.AreEqual("", result.Reason);
    Assert.IsFalse(result.ShouldHardStopCrawl);
    Assert.IsFalse(result.ShouldStopCrawl);
}
/// <summary>
/// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor.
/// </summary>
/// <param name="crawlConfiguration">Configurable crawl values</param>
/// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
/// <param name="threadManager">Distributes http requests over multiple threads</param>
/// <param name="scheduler">Decides what link should be crawled next</param>
/// <param name="httpRequester">Makes the raw http requests</param>
/// <param name="hyperLinkParser">Parses a crawled page for its hyperlinks</param>
/// <param name="memoryManager">Checks the memory usage of the host process</param>
public WebCrawler(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester httpRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager)
{
    _crawlContext = new CrawlContext();
    _crawlContext.CrawlConfiguration = crawlConfiguration ?? GetCrawlConfigurationFromConfigFile() ?? new CrawlConfiguration();
    CrawlBag = _crawlContext.CrawlBag;

    _threadManager = threadManager ?? new ManualThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads);
    _scheduler = scheduler ?? new FifoScheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled);
    _httpRequester = httpRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
    _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

    if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0 || _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
        _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));

    _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser();

    _crawlContext.Scheduler = _scheduler;
}
/// <summary>
/// Constructor used to create the object that passes arguments when the robots.txt file has been parsed.
/// </summary>
/// <param name="crawlContext"></param>
/// <param name="robots"></param>
public RobotsDotTextParseCompletedArgs(CrawlContext crawlContext, IRobots robots)
    : base(crawlContext)
{
    Robots = robots;
}