private CrawlDecision ShouldCrawlPageContent(CrawledPage page, CrawlContext context)
{
    var result = new CrawlDecision();

    if (page.Uri.ToString().Contains("product") ||
        //page.Uri.ToString().Contains("lenovo") ||
        //page.Uri.ToString().Contains("laptop") ||
        page.Uri.ToString().Contains("productVariantGroup") ||
        page.Uri.ToString().Contains("-pc"))
    {
        result.Allow = true;
        if (page.Uri.ToString().Contains("-pch"))
        {
            result.Reason = "Not a product";
            result.Allow = false;
        }
    }
    else
    {
        result.Reason = "Not a product";
        result.Allow = false;
    }

    return result;
}
public WebSpider()
{
    _crawler = new PoliteWebCrawler();
    _crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    _crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    _crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    _crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    _crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
    {
        CrawlDecision decision = new CrawlDecision { Allow = true };

        var isCrawlDepth1 = pageToCrawl.CrawlDepth == 0 && !pageToCrawl.Uri.AbsoluteUri.Contains("www.baidu.com/s?wd");
        var isCrawlDepth2 = pageToCrawl.CrawlDepth == 1 && !pageToCrawl.Uri.AbsoluteUri.Contains("www.baidu.com/link");

        if (isCrawlDepth1 || isCrawlDepth2)
        {
            return new CrawlDecision { Allow = false, Reason = "Don't want to crawl non-Baidu pages" };
        }

        return decision;
    });
}
protected virtual bool ShouldCrawlPage(PageToCrawl pageToCrawl)
{
    if (_maxPagesToCrawlLimitReachedOrScheduled)
    {
        return false;
    }

    CrawlDecision shouldCrawlPageDecision = _crawlDecisionMaker.ShouldCrawlPage(pageToCrawl, _crawlContext);
    if (!shouldCrawlPageDecision.Allow && shouldCrawlPageDecision.Reason.Contains("MaxPagesToCrawl limit of"))
    {
        _maxPagesToCrawlLimitReachedOrScheduled = true;
        _logger.Info("MaxPagesToCrawlLimit has been reached or scheduled. No more pages will be scheduled.");
        return false;
    }

    if (shouldCrawlPageDecision.Allow)
    {
        shouldCrawlPageDecision = (_shouldCrawlPageDecisionMaker != null)
            ? _shouldCrawlPageDecisionMaker.Invoke(pageToCrawl, _crawlContext)
            : new CrawlDecision { Allow = true };
    }

    if (!shouldCrawlPageDecision.Allow)
    {
        _logger.DebugFormat("Page [{0}] not crawled, [{1}]", pageToCrawl.Uri.AbsoluteUri, shouldCrawlPageDecision.Reason);
        FirePageCrawlDisallowedEventAsync(pageToCrawl, shouldCrawlPageDecision.Reason);
        FirePageCrawlDisallowedEvent(pageToCrawl, shouldCrawlPageDecision.Reason);
    }

    SignalCrawlStopIfNeeded(shouldCrawlPageDecision);
    return shouldCrawlPageDecision.Allow;
}
public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
    if (pageToCrawl == null)
    {
        return CrawlDecision.DisallowCrawl("Null page to crawl");
    }

    if (crawlContext == null)
    {
        return CrawlDecision.DisallowCrawl("Null crawl context");
    }

    if (pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
    {
        return CrawlDecision.DisallowCrawl("Crawl depth is above max");
    }

    if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
    {
        return CrawlDecision.DisallowCrawl("Scheme does not begin with http");
    }

    //TODO Do we want to ignore redirect chains (i.e. not treat them as separate page crawls)?
    if (!pageToCrawl.IsRetry &&
        crawlContext.CrawlConfiguration.MaxPagesToCrawl > 0 &&
        crawlContext.CrawledCount > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
    {
        return CrawlDecision.DisallowCrawl(string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl));
    }

    return CrawlDecision.AllowCrawl();
}
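// The limits enforced by the decision maker above come from the CrawlConfiguration attached
// to the CrawlContext. A minimal sketch of wiring those values up, using only the properties
// the method reads (MaxCrawlDepth, MaxPagesToCrawl); the concrete numbers are arbitrary
// illustrations, not recommended settings.
CrawlConfiguration config = new CrawlConfiguration
{
    MaxCrawlDepth = 2,      // anything deeper returns "Crawl depth is above max"
    MaxPagesToCrawl = 1000  // once CrawledCount exceeds this, new pages are disallowed
};

CrawlContext context = new CrawlContext
{
    CrawlConfiguration = config,
    RootUri = new Uri("http://a.com/")
};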
public void ShouldCrawlPage_NullCrawlContext_ReturnsFalse()
{
    CrawlDecision result = _unitUnderTest.ShouldCrawlPage(new PageToCrawl(new Uri("http://a.com/")), null);

    Assert.IsFalse(result.Allow);
    Assert.AreEqual("Null crawl context", result.Reason);
}
public void ShouldCrawlPageLinks_IsExternalPageLinksCrawlingEnabledFalse_InternalLink_ReturnsTrue()
{
    CrawlDecision result = _unitUnderTest.ShouldCrawlPageLinks(
        new CrawledPage(new Uri("http://b.com/a.html"))
        {
            Content = new PageContent { Text = "aaaa" },
            IsInternal = true
        },
        new CrawlContext
        {
            RootUri = new Uri("http://a.com/"),
            CrawlConfiguration = new CrawlConfiguration { IsExternalPageLinksCrawlingEnabled = false }
        });

    Assert.AreEqual(true, result.Allow);
    Assert.AreEqual("", result.Reason);
    Assert.IsFalse(result.ShouldHardStopCrawl);
    Assert.IsFalse(result.ShouldStopCrawl);
}
public virtual CrawlDecision ShouldRecrawlPage(CrawledPage crawledPage, CrawlContext crawlContext)
{
    if (crawledPage == null)
    {
        return CrawlDecision.DisallowCrawl("Null crawled page");
    }

    if (crawlContext == null)
    {
        return CrawlDecision.DisallowCrawl("Null crawl context");
    }

    if (crawledPage.Exception == null)
    {
        return CrawlDecision.DisallowCrawl("WebException did not occur");
    }

    if (crawlContext.CrawlConfiguration.MaxRetryCount < 1)
    {
        return CrawlDecision.AllowCrawl("Unlimited retries");
    }

    if (crawledPage.RetryCount >= crawlContext.CrawlConfiguration.MaxRetryCount)
    {
        return CrawlDecision.DisallowCrawl("MaxRetryCount has been reached");
    }

    return CrawlDecision.AllowCrawl();
}
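// The recrawl decision above hinges on CrawlConfiguration.MaxRetryCount and the page's
// RetryCount. A minimal sketch (the value 3 is an arbitrary illustration): with this
// configuration, a page whose request raised an exception is re-queued at most three
// times before "MaxRetryCount has been reached" is returned.
CrawlConfiguration retryConfig = new CrawlConfiguration
{
    MaxRetryCount = 3
};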
public void ShouldCrawlPage_OverMaxPagesToCrawlPerDomain_IsRetry_ReturnsTrue()
{
    Uri uri = new Uri("http://a.com/");
    CrawlConfiguration config = new CrawlConfiguration
    {
        MaxPagesToCrawlPerDomain = 100
    };
    ConcurrentDictionary<string, int> countByDomain = new ConcurrentDictionary<string, int>();
    countByDomain.TryAdd(uri.Authority, 100);
    CrawlContext crawlContext = new CrawlContext
    {
        CrawlConfiguration = config,
        CrawlStartDate = DateTime.Now,
        CrawlCountByDomain = countByDomain
    };

    CrawlDecision result = _unitUnderTest.ShouldCrawlPage(
        new PageToCrawl(new Uri(uri.AbsoluteUri + "anotherpage"))
        {
            IsRetry = true,
            IsInternal = true
        },
        crawlContext);

    Assert.IsTrue(result.Allow);
    Assert.IsFalse(result.ShouldHardStopCrawl);
    Assert.IsFalse(result.ShouldStopCrawl);
}
protected virtual bool ShouldCrawlPage(PageToCrawl pageToCrawl)
{
    CrawlDecision shouldCrawlPageDecision = _crawlDecisionMaker.ShouldCrawlPage(pageToCrawl, _crawlContext);
    if (shouldCrawlPageDecision.Allow)
    {
        shouldCrawlPageDecision = (_shouldCrawlPageDecisionMaker != null)
            ? _shouldCrawlPageDecisionMaker.Invoke(pageToCrawl, _crawlContext)
            : new CrawlDecision { Allow = true };
    }

    if (shouldCrawlPageDecision.Allow)
    {
        AddPageToContext(pageToCrawl);
    }
    else
    {
        _logger.DebugFormat("Page [{0}] not crawled, [{1}]", pageToCrawl.Uri.AbsoluteUri, shouldCrawlPageDecision.Reason);
        FirePageCrawlDisallowedEventAsync(pageToCrawl, shouldCrawlPageDecision.Reason);
        FirePageCrawlDisallowedEvent(pageToCrawl, shouldCrawlPageDecision.Reason);
    }

    SignalCrawlStopIfNeeded(shouldCrawlPageDecision);
    return shouldCrawlPageDecision.Allow;
}
public void ShouldCrawlPage_NonHttpOrHttpsSchemes_ReturnsFalse()
{
    CrawlDecision result = _unitUnderTest.ShouldCrawlPage(new PageToCrawl(new Uri("file:///C:/Users/")), _crawlContext);
    Assert.IsFalse(result.Allow);
    Assert.AreEqual("Scheme does not begin with http", result.Reason);
    Assert.IsFalse(result.ShouldHardStopCrawl);
    Assert.IsFalse(result.ShouldStopCrawl);

    result = _unitUnderTest.ShouldCrawlPage(new PageToCrawl(new Uri("mailto:[email protected]")), _crawlContext);
    Assert.IsFalse(result.Allow);
    Assert.AreEqual("Scheme does not begin with http", result.Reason);
    Assert.IsFalse(result.ShouldHardStopCrawl);
    Assert.IsFalse(result.ShouldStopCrawl);

    result = _unitUnderTest.ShouldCrawlPage(new PageToCrawl(new Uri("ftp://[email protected]")), _crawlContext);
    Assert.IsFalse(result.Allow);
    Assert.AreEqual("Scheme does not begin with http", result.Reason);
    Assert.IsFalse(result.ShouldHardStopCrawl);
    Assert.IsFalse(result.ShouldStopCrawl);

    result = _unitUnderTest.ShouldCrawlPage(new PageToCrawl(new Uri("callto:+1234567")), _crawlContext);
    Assert.IsFalse(result.Allow);
    Assert.AreEqual("Scheme does not begin with http", result.Reason);
    Assert.IsFalse(result.ShouldHardStopCrawl);
    Assert.IsFalse(result.ShouldStopCrawl);

    result = _unitUnderTest.ShouldCrawlPage(new PageToCrawl(new Uri("tel:+1234567")), _crawlContext);
    Assert.IsFalse(result.Allow);
    Assert.AreEqual("Scheme does not begin with http", result.Reason);
    Assert.IsFalse(result.ShouldHardStopCrawl);
    Assert.IsFalse(result.ShouldStopCrawl);
}
public void ShouldDownloadPageContent_NullCrawledPage_ReturnsFalse()
{
    CrawlDecision result = _unitUnderTest.ShouldDownloadPageContent(null, new CrawlContext());

    Assert.AreEqual(false, result.Allow);
    Assert.AreEqual("Null crawled page", result.Reason);
}
public void ShouldDownloadPageContent_NullCrawlContext_ReturnsFalse()
{
    CrawlDecision result = _unitUnderTest.ShouldDownloadPageContent(new CrawledPage(new Uri("http://a.com/a.html")), null);

    Assert.IsFalse(result.Allow);
    Assert.AreEqual("Null crawl context", result.Reason);
}
public void Constructor_ValidUri_CreatesInstance()
{
    CrawlDecision unitUnderTest = new CrawlDecision();

    Assert.AreEqual(false, unitUnderTest.Allow);
    Assert.AreEqual("", unitUnderTest.Reason);
}
public void ShouldCrawlPageLinks_IsAboveMaxCrawlDepth_ReturnsFalse()
{
    CrawlDecision result = _unitUnderTest.ShouldCrawlPageLinks(
        new CrawledPage(new Uri("http://b.com/a.html"))
        {
            Content = new PageContent { Text = "aaaa" },
            IsInternal = true,
            CrawlDepth = 3
        },
        new CrawlContext
        {
            RootUri = new Uri("http://a.com/"),
            CrawlConfiguration = new CrawlConfiguration { MaxCrawlDepth = 2 }
        });

    Assert.AreEqual(false, result.Allow);
    Assert.AreEqual("Crawl depth is above max", result.Reason);
    Assert.IsFalse(result.ShouldHardStopCrawl);
    Assert.IsFalse(result.ShouldStopCrawl);
}
protected virtual bool ShouldCrawlPage(PageToCrawl pageToCrawl)
{
    CrawlDecision shouldCrawlPageDecision = _crawlDecisionMaker.ShouldCrawlPage(pageToCrawl, _crawlContext);
    if (!shouldCrawlPageDecision.Allow && shouldCrawlPageDecision.Reason.Contains("MaxPagesToCrawl limit of"))
    {
        _logger.LogInformation("MaxPagesToCrawlLimit has been reached or scheduled. No more pages will be scheduled.");
        return false;
    }

    if (shouldCrawlPageDecision.Allow)
    {
        shouldCrawlPageDecision = (_shouldCrawlPageDecisionMaker != null)
            ? _shouldCrawlPageDecisionMaker.Invoke(pageToCrawl, _crawlContext)
            : CrawlDecision.AllowCrawl();
    }

    if (!shouldCrawlPageDecision.Allow)
    {
        _logger.LogDebug("Page [{0}] not crawled, [{1}]", pageToCrawl.Uri.AbsoluteUri, shouldCrawlPageDecision.Reason);
        FirePageCrawlDisallowedEventAsync(pageToCrawl, shouldCrawlPageDecision.Reason);
        //FirePageCrawlDisallowedEvent(pageToCrawl, shouldCrawlPageDecision.Reason);
    }

    return shouldCrawlPageDecision.Allow;
}
public void ShouldCrawlPageLinks_NullCrawledPage_ReturnsFalse()
{
    CrawlDecision result = _unitUnderTest.ShouldCrawlPageLinks(null, new CrawlContext());

    Assert.IsFalse(result.Allow);
    Assert.AreEqual("Null crawled page", result.Reason);
}
public void ShouldCrawlPage_OverMaxPagesToCrawlPerDomain_ReturnsFalse()
{
    Uri uri = new Uri("http://a.com/");
    CrawlConfiguration config = new CrawlConfiguration
    {
        MaxPagesToCrawlPerDomain = 100
    };
    ConcurrentDictionary<string, int> countByDomain = new ConcurrentDictionary<string, int>();
    countByDomain.TryAdd(uri.Authority, 100);
    CrawlContext crawlContext = new CrawlContext
    {
        CrawlConfiguration = config,
        CrawlStartDate = DateTime.Now,
        CrawlCountByDomain = countByDomain
    };

    CrawlDecision result = _unitUnderTest.ShouldCrawlPage(
        new PageToCrawl(new Uri(uri.AbsoluteUri + "anotherpage")) { IsInternal = true },
        crawlContext);

    Assert.IsFalse(result.Allow);
    Assert.AreEqual("MaxPagesToCrawlPerDomain limit of [100] has been reached for domain [a.com]", result.Reason);
    Assert.IsFalse(crawlContext.IsCrawlStopRequested);
}
public void ShouldCrawlPage_NullPageToCrawl_ReturnsFalse()
{
    CrawlDecision result = _unitUnderTest.ShouldCrawlPage(null, _crawlContext);

    Assert.IsFalse(result.Allow);
    Assert.AreEqual("Null page to crawl", result.Reason);
    Assert.IsFalse(_crawlContext.IsCrawlStopRequested);
}
/// <summary>
/// Makes an HTTP request to the url and downloads its content, if allowed by the shouldDownloadContent delegate
/// </summary>
public virtual CrawledPage MakeRequest(Uri uri, Func<CrawledPage, CrawlDecision> shouldDownloadContent)
{
    if (uri == null)
    {
        throw new ArgumentNullException("uri");
    }

    CrawledPage crawledPage = new CrawledPage(uri);

    HttpWebRequest request = null;
    HttpWebResponse response = null;
    try
    {
        request = BuildRequestObject(uri);
        response = (HttpWebResponse)request.GetResponse();
        ProcessResponseObject(response);
    }
    catch (WebException e)
    {
        crawledPage.WebException = e;

        if (e.Response != null)
        {
            response = (HttpWebResponse)e.Response;
        }

        _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
        _logger.Debug(e);
    }
    catch (Exception e)
    {
        _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
        _logger.Debug(e);
    }
    finally
    {
        crawledPage.HttpWebRequest = request;

        if (response != null)
        {
            crawledPage.HttpWebResponse = response;
            CrawlDecision shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
            if (shouldDownloadContentDecision.Allow)
            {
                crawledPage.Content = _extractor.GetContent(response);
            }
            else
            {
                _logger.DebugFormat("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldDownloadContentDecision.Reason);
            }

            response.Close(); //Should already be closed by _extractor but just being safe
        }
    }

    return crawledPage;
}
public void Constructor_ValidUri_CreatesInstance()
{
    var unitUnderTest = new CrawlDecision();

    Assert.AreEqual(false, unitUnderTest.Allow);
    Assert.AreEqual("", unitUnderTest.Reason);
    Assert.IsFalse(unitUnderTest.ShouldHardStopCrawl);
    Assert.IsFalse(unitUnderTest.ShouldStopCrawl);
}
public void ShouldDownloadPageContent_NonHtmlPage_ReturnsFalse()
{
    Uri imageUrl = new Uri("http://localhost:1111/Content/themes/base/images/ui-bg_flat_0_aaaaaa_40x100.png");

    CrawlDecision result = _unitUnderTest.ShouldDownloadPageContent(new PageRequester(_crawlContext.CrawlConfiguration).MakeRequest(imageUrl), _crawlContext);

    Assert.AreEqual(false, result.Allow);
    Assert.AreEqual("Content type is not any of the following: text/html", result.Reason);
}
/// <summary>
/// Makes an HTTP request to the url and downloads its content, if allowed by the shouldDownloadContent delegate
/// </summary>
public virtual CrawledPage MakeRequest(Uri uri, Func<CrawledPage, CrawlDecision> shouldDownloadContent)
{
    if (uri == null)
    {
        throw new ArgumentNullException("uri");
    }

    CrawledPage crawledPage = new CrawledPage(uri);

    HttpWebRequest request = null;
    HttpWebResponse response = null;
    try
    {
        request = BuildRequestObject(uri);
        response = (HttpWebResponse)request.GetResponse();
    }
    catch (WebException e)
    {
        crawledPage.WebException = e;

        if (e.Response != null)
        {
            response = (HttpWebResponse)e.Response;
        }

        _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
        _logger.Debug(e);
    }
    catch (Exception e)
    {
        _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
        _logger.Debug(e);
    }
    finally
    {
        crawledPage.HttpWebRequest = request;

        if (response != null)
        {
            crawledPage.HttpWebResponse = response;
            CrawlDecision shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
            if (shouldDownloadContentDecision.Allow)
            {
                crawledPage.RawContent = GetRawHtml(response, uri);
                crawledPage.PageSizeInBytes = Encoding.UTF8.GetBytes(crawledPage.RawContent).Length;
            }
            else
            {
                _logger.DebugFormat("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldDownloadContentDecision.Reason);
            }

            response.Close();
        }
    }

    return crawledPage;
}
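// A minimal sketch of calling the MakeRequest overload above through a PageRequester
// instance (mirroring the single-argument usage shown in the tests). The delegate only
// allows download of bodies the server reports as text/html; the target URL and the
// "text/html" reason string are purely illustrative.
var requester = new PageRequester(_crawlContext.CrawlConfiguration);
CrawledPage page = requester.MakeRequest(new Uri("http://a.com/"), crawledPage =>
{
    string contentType = crawledPage.HttpWebResponse != null
        ? crawledPage.HttpWebResponse.ContentType
        : "";

    if (!contentType.Contains("text/html"))
    {
        return new CrawlDecision { Allow = false, Reason = "Content type is not text/html" };
    }

    return new CrawlDecision { Allow = true };
});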
public void ShouldDownloadPageContent_HttpStatusNon200_ReturnsFalse()
{
    Uri non200Uri = new Uri("http://localhost:1111/HttpResponse/Status403");

    CrawlDecision result = _unitUnderTest.ShouldDownloadPageContent(new PageRequester(_crawlContext.CrawlConfiguration).MakeRequest(non200Uri), new CrawlContext());

    Assert.AreEqual(false, result.Allow);
    Assert.AreEqual("HttpStatusCode is not 200", result.Reason);
}
public void ShouldRecrawlPage_NullPageToCrawl_ReturnsFalse()
{
    CrawlDecision result = _unitUnderTest.ShouldRecrawlPage(null, _crawlContext);

    Assert.IsFalse(result.Allow);
    Assert.AreEqual("Null crawled page", result.Reason);
    Assert.IsFalse(result.ShouldHardStopCrawl);
    Assert.IsFalse(result.ShouldStopCrawl);
}
private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler()
{
    IWebCrawler crawler = GetManuallyConfiguredWebCrawler(siteToCrawl);

    //Register a lambda expression that restricts the crawl to urls containing the current
    //category (spaces encoded as "+") and skips urls containing "_KG" or "_EA".
    //If you set the log4net log level to "DEBUG" you will see a log message when any page is not allowed to be crawled.
    //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPage method is run.
    crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
    {
        //if (!pageToCrawl.Uri.AbsoluteUri.Contains("chicken") && !pageToCrawl.Uri.AbsoluteUri.Contains("Chicken"))
        if (!pageToCrawl.Uri.AbsoluteUri.Contains(category.Replace(" ", "+")) ||
            /*pageToCrawl.Uri.AbsoluteUri.Contains("navid")||*/
            pageToCrawl.Uri.AbsoluteUri.Contains("_KG") ||
            pageToCrawl.Uri.AbsoluteUri.Contains("_EA"))
        {
            return new CrawlDecision { Allow = false, Reason = "I only crawl the right pages" };
        }

        return new CrawlDecision { Allow = true };
    });

    //Register a lambda expression that will tell Abot to not download the page content for any page after the 5th.
    //Abot will still make the http request but will not read the raw content from the stream.
    //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldDownloadPageContent method is run.
    /*crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>
    {
        if (crawlContext.CrawledCount >= 5)
            return new CrawlDecision { Allow = false, Reason = "We already downloaded the raw page content for 5 pages" };

        return new CrawlDecision { Allow = true };
    });*/

    //Register a lambda expression that will tell Abot to not crawl links on any page whose content is smaller than 100 bytes.
    //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPageLinks method is run.
    crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
    {
        CrawlDecision decision = new CrawlDecision { Allow = true };
        if (crawledPage.Content.Bytes.Length < 100)
        {
            return new CrawlDecision { Allow = false, Reason = "Just crawl links in pages that have at least 100 bytes" };
        }

        return decision;
    });

    return crawler;
}
public void ShouldCrawlPageLinks_NullCrawlContext_ReturnsFalse()
{
    CrawlDecision result = _unitUnderTest.ShouldCrawlPageLinks(new CrawledPage(new Uri("http://a.com/a.html")) { RawContent = "aaaa" }, null);

    Assert.IsFalse(result.Allow);
    Assert.AreEqual("Null crawl context", result.Reason);
}
public void ShouldCrawlPageLinks_EmptyHtmlContent_ReturnsFalse()
{
    CrawlDecision result = _unitUnderTest.ShouldCrawlPageLinks(new CrawledPage(new Uri("http://a.com/")) { RawContent = "" }, new CrawlContext());

    Assert.IsFalse(result.Allow);
    Assert.AreEqual("Page has no content", result.Reason);
}
public void ShouldCrawlPage_Duplicate_ReturnsFalse()
{
    _crawlContext.CrawledUrls = new ConcurrentDictionary<string, byte>();
    _crawlContext.CrawledUrls.TryAdd("http://a.com/", 0);

    CrawlDecision result = _unitUnderTest.ShouldCrawlPage(new PageToCrawl(new Uri("http://a.com/")), _crawlContext);

    Assert.IsFalse(result.Allow);
    Assert.AreEqual("Link already crawled", result.Reason);
    Assert.IsFalse(_crawlContext.IsCrawlStopRequested);
}
public async Task ShouldDownloadPageContent_NonHtmlPage_ReturnsFalse()
{
    Uri imageUrl = new Uri(string.Concat(unitTestConfig.SiteSimulatorBaseAddress, "themes/base/images/ui-bg_flat_0_aaaaaa_40x100.png"));

    CrawlDecision result = _unitUnderTest.ShouldDownloadPageContent(await new PageRequester(_crawlContext.CrawlConfiguration).MakeRequestAsync(imageUrl), _crawlContext);

    Assert.AreEqual(false, result.Allow);
    Assert.AreEqual("Content type is not any of the following: text/html", result.Reason);
    Assert.IsFalse(result.ShouldHardStopCrawl);
    Assert.IsFalse(result.ShouldStopCrawl);
}
public async Task ShouldDownloadPageContent_HttpStatusNon200_ReturnsFalse()
{
    Uri non200Uri = new Uri(string.Concat(unitTestConfig.SiteSimulatorBaseAddress, "/HttpResponse/Status403"));

    CrawlDecision result = _unitUnderTest.ShouldDownloadPageContent(await new PageRequester(_crawlContext.CrawlConfiguration).MakeRequestAsync(non200Uri), new CrawlContext());

    Assert.AreEqual(false, result.Allow);
    Assert.AreEqual("HttpStatusCode is not 200", result.Reason);
    Assert.IsFalse(result.ShouldHardStopCrawl);
    Assert.IsFalse(result.ShouldStopCrawl);
}
protected virtual void SignalCrawlStopIfNeeded(CrawlDecision decision)
{
    if (decision.ShouldHardStopCrawl)
    {
        _logger.InfoFormat("Decision marked crawl [Hard Stop] for site [{0}], [{1}]", _crawlContext.RootUri, decision.Reason);
        _crawlContext.IsCrawlHardStopRequested = decision.ShouldHardStopCrawl;
    }
    else if (decision.ShouldStopCrawl)
    {
        _logger.InfoFormat("Decision marked crawl [Stop] for site [{0}], [{1}]", _crawlContext.RootUri, decision.Reason);
        _crawlContext.IsCrawlStopRequested = decision.ShouldStopCrawl;
    }
}
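// A minimal sketch of a custom decision that triggers the stop logic above: any
// CrawlDecision returned from a ShouldCrawlPage lambda can set ShouldStopCrawl (or
// ShouldHardStopCrawl), and SignalCrawlStopIfNeeded will then flag the crawl context.
// The 500-page budget and the crawler variable are assumptions for illustration only.
crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
{
    if (crawlContext.CrawledCount >= 500)
    {
        return new CrawlDecision
        {
            Allow = false,
            Reason = "Reached the 500 page budget",
            ShouldStopCrawl = true
        };
    }

    return new CrawlDecision { Allow = true };
});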