protected override string GetBaseHrefValue(CrawledPage crawledPage)
{
    string hrefValue = "";

    HtmlNode node = crawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//base");
    if (node != null)
        hrefValue = node.GetAttributeValue("href", "").Trim();

    return hrefValue;
}
protected override IEnumerable<string> GetHrefValues(CrawledPage crawledPage)
{
    List<string> hrefValues = new List<string>();
    if (HasRobotsNoFollow(crawledPage))
        return hrefValues;

    HtmlNodeCollection aTags = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//a[@href]");
    HtmlNodeCollection areaTags = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//area[@href]");

    hrefValues.AddRange(GetLinks(aTags));
    hrefValues.AddRange(GetLinks(areaTags));

    return hrefValues;
}
public async Task PageCrawledAsync(CrawledPage crawledPage)
{
    string text = _textExtractor.ExtractText(crawledPage.HtmlDocument);
    if (text == null)
    {
        Console.WriteLine("No content for page {0}", crawledPage?.Uri.AbsoluteUri);
        return;
    }

    _queue.Add(new WebPage(crawledPage.Uri.AbsoluteUri, text));
    if (_queue.Count > IndexingBatchSize)
        await IndexBatchIfNecessary();
}
private void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage crawledPage = e.CrawledPage;

    string result = "";
    if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
        result = string.Format("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
    else
        result = string.Format("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);

    if (string.IsNullOrEmpty(crawledPage.Content.Text))
        result = string.Format("Page had no content {0}", crawledPage.Uri.AbsoluteUri);

    log.Info("crawler_ProcessPageCrawlCompleted");
    log.Info(result);

    if (!string.IsNullOrEmpty(crawledPage.Content.Text) && crawledPage.Uri.AbsoluteUri.Contains("/jobs/view/"))
    {
        var doc = crawledPage.HtmlDocument; //Html Agility Pack parser
        //var angleSharpHtmlDocument = crawledPage.AngleSharpHtmlDocument; //AngleSharp parser

        try
        {
            string positionTitle = doc.DocumentNode.SelectSingleNode("//h1[@class='search_highlight']").InnerText.Trim();
            string location = doc.DocumentNode.SelectSingleNode("//div[@class='location search_highlight']").InnerText.Trim();
            string companyName = doc.DocumentNode.SelectSingleNode("//span[@class='search_highlight']//a").InnerText.Trim();
            string postedDate = doc.DocumentNode.SelectSingleNode("//div[@id='contentHeading']//div[@class='meta']").InnerText.Trim().Split(new char[] { '\n' })[0].Replace("Posted on : ", "");
            string jobDescription = doc.DocumentNode.SelectSingleNode("//meta[@name='description']").Attributes["content"].Value;
            string experience = doc.DocumentNode.SelectSingleNode("//div[@class='field_experience_required']/div[@class='job-level']/span").InnerText.Trim();

            log.Info(string.Format("Position: {0}; Location: {1}; Company Name: {2}; Posted Date: {3}; Job Desc: {4}; Exp: {5}", positionTitle, location, companyName, postedDate, jobDescription, experience));
        }
        catch (Exception ex)
        {
            log.Error(ex.Message);
        }
    }
}
public virtual CrawlDecision ShouldCrawlPageLinks(CrawledPage crawledPage, CrawlContext crawlContext)
{
    if (crawledPage == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawled page" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    if (string.IsNullOrWhiteSpace(crawledPage.Content.Text))
        return new CrawlDecision { Allow = false, Reason = "Page has no content" };

    if (!crawlContext.CrawlConfiguration.IsExternalPageLinksCrawlingEnabled && !crawledPage.IsInternal)
        return new CrawlDecision { Allow = false, Reason = "Link is external" };

    if (crawledPage.CrawlDepth >= crawlContext.CrawlConfiguration.MaxCrawlDepth)
        return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

    return new CrawlDecision { Allow = true };
}
protected virtual bool ShouldRecrawlPage(CrawledPage crawledPage)
{
    //TODO No unit tests cover these lines
    var shouldRecrawlPageDecision = _crawlDecisionMaker.ShouldRecrawlPage(crawledPage, _crawlContext);
    if (shouldRecrawlPageDecision.Allow)
    {
        shouldRecrawlPageDecision = (_shouldRecrawlPageDecisionMaker != null)
            ? _shouldRecrawlPageDecisionMaker.Invoke(crawledPage, _crawlContext)
            : new CrawlDecision { Allow = true };
    }

    if (!shouldRecrawlPageDecision.Allow)
    {
        _logger.LogDebug($"Page [{crawledPage.Uri.AbsoluteUri}] not recrawled, [{shouldRecrawlPageDecision.Reason}]");
    }
    else
    {
        // Look for the Retry-After header in the response.
        crawledPage.RetryAfter = null;
        if (crawledPage.HttpWebResponse != null && crawledPage.HttpWebResponse.Headers != null)
        {
            var value = crawledPage.HttpWebResponse.GetResponseHeader("Retry-After");
            if (!String.IsNullOrEmpty(value))
            {
                // Try to convert to DateTime first, then to double (seconds).
                DateTime date;
                double seconds;
                if (crawledPage.LastRequest.HasValue && DateTime.TryParse(value, out date))
                    crawledPage.RetryAfter = (date - crawledPage.LastRequest.Value).TotalSeconds;
                else if (double.TryParse(value, out seconds))
                    crawledPage.RetryAfter = seconds;
            }
        }
    }

    SignalCrawlStopIfNeeded(shouldRecrawlPageDecision);
    return shouldRecrawlPageDecision.Allow;
}
private void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage crawledPage = e.CrawledPage;

    if (_amazonHelper.IsCaptchaPage(crawledPage))
    {
        lock (_captchaLock)
        {
            InvokeIfRequired(() =>
            {
                var form = new BrowserForm();
                form.Browser.Navigate(crawledPage.Uri);
                form.ShowDialog();
            });
        }
    }

    var products = _amazonHelper.GetProductsFromDetailPage(crawledPage);
    foreach (var p in products)
    {
        if (!_products.Any(x => x.external_product_id == p.external_product_id))
        {
            InvokeIfRequired(() =>
            {
                _products.Add(p);
                Application.DoEvents();
            });
        }
    }

    if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
        _logger.InfoFormat("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
    else
        _logger.InfoFormat("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);

    if (string.IsNullOrEmpty(crawledPage.Content.Text))
        _logger.InfoFormat("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
}
public void SetUp()
{
    _uut = new ConstantContactCrawlProcessor();
    _fakePrimaryPersistence = new Mock<IPersistenceProvider>();
    _fakeBackupPersistence = new Mock<IPersistenceProvider>();

    _crawlContext = new CrawlContext();
    _crawlContext.CrawlBag.GoDaddyProcessorContext = new ProcessorContext
    {
        PrimaryPersistenceProvider = _fakePrimaryPersistence.Object,
        BackupPersistenceProvider = _fakeBackupPersistence.Object,
        Domain = new Domain { DomainId = 111 }
    };

    _crawledPage = new CrawledPage(new Uri("http://a.com/"));
}
/// <summary>
/// Parses html to extract hyperlinks, converts each into an absolute url
/// </summary>
public virtual IEnumerable<HyperLink> GetLinks(CrawledPage crawledPage)
{
    CheckParams(crawledPage);

    var timer = Stopwatch.StartNew();

    var links = GetUris(crawledPage, GetHrefValues(crawledPage))
        .Select(hrv => new HyperLink() { HrefValue = hrv })
        .ToList();

    timer.Stop();
    Log.DebugFormat("{0} parsed links from [{1}] in [{2}] milliseconds", ParserType, crawledPage.Uri, timer.ElapsedMilliseconds);

    return links;
}
protected override string GetBaseHrefValue(CrawledPage crawledPage)
{
    var baseTag = crawledPage.AngleSharpHtmlDocument.QuerySelector("base");
    if (baseTag == null)
        return "";

    var baseTagValue = baseTag.Attributes["href"];
    if (baseTagValue == null)
        return "";

    return baseTagValue.Value.Trim();
}
protected override IEnumerable<string> GetHrefValues(CrawledPage crawledPage)
{
    if (HasRobotsNoFollow(crawledPage))
        return null;

    IEnumerable<string> hrefValues = crawledPage.CsQueryDocument.Select("a, area")
        .Elements
        .Where(e => !HasRelNoFollow(e))
        .Select(y => y.GetAttribute("href"))
        .Where(a => !string.IsNullOrWhiteSpace(a));

    IEnumerable<string> canonicalHref = crawledPage.CsQueryDocument
        .Select("link")
        .Elements
        .Where(e => HasRelCanonicalPointingToDifferentUrl(e, crawledPage.Uri.ToString()))
        .Select(e => e.Attributes["href"]);

    return hrefValues.Concat(canonicalHref);
}
protected virtual bool ShouldCrawlPageLinks(CrawledPage crawledPage)
{
    CrawlDecision shouldCrawlPageLinksDecision = _crawlDecisionMaker.ShouldCrawlPageLinks(crawledPage, _crawlContext);
    if (shouldCrawlPageLinksDecision.Allow)
    {
        shouldCrawlPageLinksDecision = (_shouldCrawlPageLinksDecisionMaker != null)
            ? _shouldCrawlPageLinksDecisionMaker.Invoke(crawledPage, _crawlContext)
            : CrawlDecision.AllowCrawl();
    }

    if (!shouldCrawlPageLinksDecision.Allow)
    {
        _logger.LogDebug("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldCrawlPageLinksDecision.Reason);
        FirePageLinksCrawlDisallowedEventAsync(crawledPage, shouldCrawlPageLinksDecision.Reason);
        //FirePageLinksCrawlDisallowedEvent(crawledPage, shouldCrawlPageLinksDecision.Reason);
    }

    return shouldCrawlPageLinksDecision.Allow;
}
public void SetUp()
{
    _dummySemList = new SemList();
    _uut = new SemKeywordCrawlProcessor(_dummySemList);
    _fakePrimaryPersistence = new Mock<IPersistenceProvider>();

    _crawlContext = new CrawlContext();
    _crawlContext.CrawlBag.GoDaddyProcessorContext = new ProcessorContext
    {
        PrimaryPersistenceProvider = _fakePrimaryPersistence.Object,
        BackupPersistenceProvider = null, //no need since this is tested in another child class
        Domain = new Domain { DomainId = 111 }
    };

    _crawledPage = new CrawledPage(new Uri("http://a.com/"));
}
void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage crawledPage = e.CrawledPage;

    if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
        Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
    else
        Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);

    if (string.IsNullOrEmpty(crawledPage.Content.Text))
        Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
}
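For context, a completion handler like the one above is attached to Abot's PoliteWebCrawler before the crawl starts. The sketch below assumes Abot 1.x naming (PageCrawlCompletedAsync event, synchronous Crawl method); Abot 2.x drops the Async-suffixed events and crawls via CrawlAsync. The root URL is a placeholder.

using System;
using Abot.Crawler;
using Abot.Poco;

// Sketch only: assumes this method lives in the same class as the
// crawler_ProcessPageCrawlCompleted handler shown above.
void RunCrawl()
{
    var crawler = new PoliteWebCrawler();

    // Abot 1.x event name; in Abot 2.x subscribe to PageCrawlCompleted instead.
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;

    CrawlResult result = crawler.Crawl(new Uri("http://example.com/")); // placeholder root URL

    if (result.ErrorOccurred)
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    else
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
}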
public void ProcessCrawledPage(CrawlContext crawlContext, CrawledPage crawledPage)
{
    if (!crawledPage.IsRoot)
        return;

    ProcessorResult result = ProcessPage(crawlContext, crawledPage);
    if (result.IsAHit)
        PageSave(crawlContext, crawledPage, result);
}
static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage crawledPage = e.CrawledPage;

    if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
        Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
    else
        Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);

    if (crawledPage.Uri.AbsoluteUri != "https://belsat.eu/ru/news/")
        Parser.Parse(crawledPage.Content.Text, crawledPage.Uri); //crawledPage.Content.Text is the raw html
}
private void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage crawledPage = e.CrawledPage;

    string result = "";
    if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
        result = string.Format("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
    else
        result = string.Format("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);

    if (string.IsNullOrEmpty(crawledPage.Content.Text))
        result = string.Format("Page had no content {0}", crawledPage.Uri.AbsoluteUri);

    log.Info("crawler_ProcessPageCrawlCompleted");
    log.Info(result);

    if (!string.IsNullOrEmpty(crawledPage.Content.Text) && crawledPage.Uri.AbsoluteUri.Contains("/jobs/"))
    {
        var doc = crawledPage.HtmlDocument; //Html Agility Pack parser
        //var angleSharpHtmlDocument = crawledPage.AngleSharpHtmlDocument; //AngleSharp parser

        try
        {
            string positionTitle = doc.DocumentNode.SelectSingleNode("//section[@class='box box_r']/ul/li/p").InnerText.Trim();
            string location = doc.DocumentNode.SelectNodes("//div[@class='cm-12 box_i bWord']/ul/li")[1].InnerText.Trim();
            string companyName = doc.DocumentNode.SelectSingleNode("//a[@id='urlverofertas']").InnerText.Trim();
            string postedDate = doc.DocumentNode.SelectSingleNode("//div[@class='cm-12 box_i bWord']/ul/p/span[@class='info_pub']/span").InnerText.Trim();
            string jobDescription = doc.DocumentNode.SelectNodes("//div[@class='cm-12 box_i bWord']/ul/li")[2].InnerHtml.Trim();

            //log.Info(string.Format("Position Title: {0}\nLocation: {1}\nCompany Name: {2}\nPosted Date: {3}\nJob Desc: {4}", positionTitle, location, companyName, postedDate, jobDescription));
        }
        catch (Exception ex)
        {
            log.Error(ex.Message);
        }
    }
}
public void Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_CrawlDelayAboveMinDomainCrawlDelay_CallsDomainRateLimiter()
{
    Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
    Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

    CrawledPage homePage = new CrawledPage(_rootUri) { RawContent = "content here" };
    CrawledPage page1 = new CrawledPage(uri1);
    CrawledPage page2 = new CrawledPage(uri2);

    List<Uri> links = new List<Uri> { uri1, uri2 };

    _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
    _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
    _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
    _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

    _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny<string>())).Returns(3); //this is more than the max configured crawl delay (should be ignored)
    _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny<string>(), It.IsAny<string>())).Returns(true);
    _fakeRobotsDotTextFinder.Setup(f => f.Find(It.IsAny<Uri>())).Returns(_fakeRobotsDotText.Object);

    _dummyConfiguration.IsRespectRobotsDotTextEnabled = true; //BY HAVING THIS EQUAL TO TRUE WE EXPECT THE IDOMAINRATELIMITER TO BE CALLED
    _dummyConfiguration.MaxRobotsDotTextCrawlDelayInSeconds = 2; //This is less than the crawl delay (should be used)

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    _unitUnderTest.Crawl(_rootUri);

    _fakeHttpRequester.VerifyAll();
    _fakeHyperLinkParser.VerifyAll();
    _fakeRobotsDotText.VerifyAll();
    _fakeRobotsDotTextFinder.VerifyAll();
    _fakeDomainRateLimiter.Verify(f => f.AddDomain(It.IsAny<Uri>(), 2000), Times.Exactly(1));
    _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Exactly(3)); //BY HAVING A CRAWL DELAY ABOVE ZERO WE EXPECT THE IDOMAINRATELIMITER TO BE CALLED
}
private void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage crawledPage = e.CrawledPage;

    string result = "";
    if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
        result = string.Format("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
    else
        result = string.Format("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);

    if (string.IsNullOrEmpty(crawledPage.Content.Text))
        result = string.Format("Page had no content {0}", crawledPage.Uri.AbsoluteUri);

    log.Info("crawler_ProcessPageCrawlCompleted");
    log.Info(result);

    if (!string.IsNullOrEmpty(crawledPage.Content.Text) && crawledPage.Uri.AbsoluteUri.Contains("/en/job/"))
    {
        var doc = crawledPage.HtmlDocument; //Html Agility Pack parser
        //var angleSharpHtmlDocument = crawledPage.AngleSharpHtmlDocument; //AngleSharp parser

        try
        {
            string positionTitle = doc.DocumentNode.SelectSingleNode("//h1[@id='position_title']").InnerText.Trim();
            string location = doc.DocumentNode.SelectSingleNode("//span[@id='single_work_location']").InnerText.Trim();
            string companyName = doc.DocumentNode.SelectSingleNode("//div[@id='company_name']/a").InnerText.Trim();
            string postedDate = doc.DocumentNode.SelectSingleNode("//p[@id='posting_date']/span").InnerText.Trim();
            string jobDescription = doc.DocumentNode.SelectSingleNode("//div[@id='job_description']").InnerHtml.Trim();

            //log.Info(string.Format("Position Title: {0}\nLocation: {1}\nCompany Name: {2}\nPosted Date: {3}\nJob Desc: {4}", positionTitle, location, companyName, postedDate, jobDescription));
        }
        catch (Exception ex)
        {
            log.Error(ex.Message);
        }
    }
}
static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage crawledPage = e.CrawledPage;

    if (isContains(crawledPage.Uri.AbsoluteUri))
    {
        // Create a logger for use in this class
        //log4net.ILog log = log4net.LogManager.GetLogger(System.Reflection.MethodBase.GetCurrentMethod().DeclaringType);
        //log.Info("Page URL : " + crawledPage.Uri.AbsoluteUri);

        CrawledItems.Add(new Crawled() { Url = crawledPage.Uri.AbsoluteUri, Description = crawledPage.Content.Text });

        if (CrawledItems.Count >= 10)
        {
            AddToDatabase(CrawledItems.ToArray());
            CrawledItems.Clear();
            Console.WriteLine("Submit 10 new records");
        }
    }

    /*
    int count = 0;
    foreach (var item in crawledPage.ParsedLinks)
    {
        log.Info("link :" + ++count + item.AbsoluteUri + ", " + (item.IsFile ? "is a file" : "is not a file"));
    }
    */
    //log.Info(crawledPage.Content.Text);

    if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
        Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);

    //log.Info(string.Format("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri));
    Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);

    if (string.IsNullOrEmpty(crawledPage.Content.Text))
        Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
}
public void Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_UsesCorrectUserAgentString()
{
    Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
    Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

    CrawledPage homePage = new CrawledPage(_rootUri) { Content = new PageContent { Text = "content here" } };
    CrawledPage page1 = new CrawledPage(uri1);
    CrawledPage page2 = new CrawledPage(uri2);

    List<Uri> links = new List<Uri> { uri1, uri2 };

    _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
    _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
    _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
    _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

    _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny<string>())).Returns(0);
    _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny<string>(), It.IsAny<string>())).Returns(true);
    _fakeRobotsDotTextFinder.Setup(f => f.Find(It.IsAny<Uri>())).Returns(_fakeRobotsDotText.Object);

    _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;
    _dummyConfiguration.RobotsDotTextUserAgentString = "abcd";

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    _unitUnderTest.Crawl(_rootUri);

    _fakeRobotsDotText.Verify(f => f.GetCrawlDelay(_dummyConfiguration.RobotsDotTextUserAgentString), Times.Exactly(1));
    _fakeRobotsDotText.Verify(f => f.IsUrlAllowed(uri1.AbsoluteUri, _dummyConfiguration.RobotsDotTextUserAgentString), Times.Exactly(1));
    _fakeRobotsDotText.Verify(f => f.IsUrlAllowed(uri1.AbsoluteUri, _dummyConfiguration.RobotsDotTextUserAgentString), Times.Exactly(1));
}
public static string FormatOutput(CrawledPage page)
{
    string linksDisplay;
    if (page.PageLinks == null)
        linksDisplay = "Could not download content";
    else if (page.PageLinks.Count == 0)
        linksDisplay = "No links found";
    else
        linksDisplay = $"{string.Join("\n", page.PageLinks)}\n[{page.PageLinks.Count} links]";

    return $"Visited Page: {page.PageUri} ({page.FirstVisitedDepth})\n------------------\n{linksDisplay}\n";
}
public virtual CrawlDecision ShouldCrawlPageLinks(CrawledPage crawledPage, CrawlContext crawlContext)
{
    if (crawledPage == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawled page" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    if (string.IsNullOrWhiteSpace(crawledPage.Content.Text))
        return new CrawlDecision { Allow = false, Reason = "Page has no content" };

    if (!crawlContext.CrawlConfiguration.IsExternalPageLinksCrawlingEnabled && !crawledPage.IsInternal)
        return new CrawlDecision { Allow = false, Reason = "Link is external" };

    if (crawledPage.CrawlDepth >= crawlContext.CrawlConfiguration.MaxCrawlDepth)
        return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

    return new CrawlDecision { Allow = true };
}
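The default decision above can also be overridden without subclassing the decision maker. The sketch below assumes the Abot 1.x delegate hook (crawler.ShouldCrawlPageLinks(...)); in Abot 2.x the equivalent is assigning the delegate to the crawler's decision-maker property. The depth threshold is illustrative only.

var crawler = new PoliteWebCrawler();
crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
{
    // Example rule layered on top of the defaults: skip link extraction on deep pages.
    if (crawledPage.CrawlDepth >= 3) // illustrative threshold, not taken from the original code
        return new CrawlDecision { Allow = false, Reason = "Too deep for link extraction" };

    return new CrawlDecision { Allow = true };
});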
async void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage crawledPage = e.CrawledPage;
    string uri = crawledPage.Uri.AbsoluteUri;

    if (crawledPage.WebException != null || crawledPage.HttpWebResponse?.StatusCode != HttpStatusCode.OK)
    {
        Console.WriteLine("Crawl of page failed {0}: exception '{1}', response status {2}", uri, crawledPage.WebException?.Message, crawledPage.HttpWebResponse?.StatusCode);
        return;
    }

    if (string.IsNullOrEmpty(crawledPage.Content.Text))
    {
        Console.WriteLine("Page had no content {0}", uri);
        return;
    }

    await _handler.PageCrawledAsync(crawledPage);
}
public IRobotsDotText Find(Uri rootUri)
{
    if (rootUri == null)
        throw new ArgumentNullException("rootUri");

    Uri robotsUri = new Uri(rootUri, "/robots.txt");
    CrawledPage page = _pageRequester.MakeRequest(robotsUri);

    if (page == null || page.WebException != null || page.HttpWebResponse == null || page.HttpWebResponse.StatusCode != HttpStatusCode.OK)
    {
        _logger.DebugFormat("Did not find robots.txt file at [{0}]", robotsUri);
        return null;
    }

    _logger.DebugFormat("Found robots.txt file at [{0}]", robotsUri);
    return new RobotsDotText(rootUri, page.Content.Text);
}
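A finder like this only comes into play when robots.txt support is enabled in the crawl configuration. A minimal configuration sketch, using the same CrawlConfiguration properties exercised in the PoliteWebCrawler tests in this collection (the user agent string and delay cap are illustrative):

var config = new CrawlConfiguration
{
    IsRespectRobotsDotTextEnabled = true,   // makes the crawler look up robots.txt before crawling
    RobotsDotTextUserAgentString = "abot",  // user agent evaluated against the robots.txt rules
    MaxRobotsDotTextCrawlDelayInSeconds = 5 // cap applied to any Crawl-delay directive that is found
};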
protected virtual bool HasRobotsNoFollow(CrawledPage crawledPage)
{
    if (!IsRespectMetaRobotsNoFollowEnabled)
        return false;

    string robotsMeta = GetMetaRobotsValue(crawledPage);
    bool isRobotsNoFollow = robotsMeta != null &&
        (robotsMeta.ToLower().Contains("nofollow") || robotsMeta.ToLower().Contains("none"));

    if (isRobotsNoFollow)
        _logger.InfoFormat("Robots NoFollow detected on uri [{0}], will not crawl links on this page.", crawledPage.Uri);

    return isRobotsNoFollow;
}
protected override IEnumerable<string> GetHrefValues(CrawledPage crawledPage)
{
    var hrefValues = new List<string>();
    if (HasRobotsNoFollow(crawledPage))
        return hrefValues;

    var aTags = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//a[@href]");
    var areaTags = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//area[@href]");
    var canonicals = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//link[@rel='canonical'][@href]");

    hrefValues.AddRange(GetLinks(aTags));
    hrefValues.AddRange(GetLinks(areaTags));
    hrefValues.AddRange(GetLinks(canonicals));

    return hrefValues;
}
static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage crawledPage = e.CrawledPage;

    if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
        Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
    else
        Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);

    //if (string.IsNullOrEmpty(crawledPage.Content.Text))
    //    Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);

    //var htmlAgilityPackDocument = crawledPage.HtmlDocument; //Html Agility Pack parser
    //var angleSharpHtmlDocument = crawledPage.AngleSharpHtmlDocument; //AngleSharp parser
}
protected override IEnumerable<string> GetHrefValues(CrawledPage crawledPage)
{
    if (HasRobotsNoFollow(crawledPage))
        return null;

    IEnumerable<string> hrefValues = crawledPage.AngleSharpHtmlDocument.QuerySelectorAll("a, area")
        .Where(e => !HasRelNoFollow(e))
        .Select(y => y.GetAttribute("href"))
        .Where(a => !string.IsNullOrWhiteSpace(a));

    IEnumerable<string> canonicalHref = crawledPage.AngleSharpHtmlDocument
        .QuerySelectorAll("link")
        .Where(e => HasRelCanonicalPointingToDifferentUrl(e, crawledPage.Uri.ToString()))
        .Select(e => e.GetAttribute("href"));

    return hrefValues.Concat(canonicalHref);
}
protected override ProcessorResult ProcessPage(CrawlContext crawlContext, CrawledPage crawledPage)
{
    nodeQueryList.Add(new KeyValuePair<string, string>("cart", "//a[contains(@href, 'cart')]"));
    nodeQueryList.Add(new KeyValuePair<string, string>("shoppingcart", "//a[contains(@href, 'shoppingcart')]"));
    nodeQueryList.Add(new KeyValuePair<string, string>("checkout", "//a[contains(@href, 'checkout')]"));

    ProcessorResult result = new ProcessorResult { UniqueAttributeId = 16 };
    result.IsAHit = FindTags(crawledPage, crawlContext.RootUri.DnsSafeHost.ToLower());
    if (result.IsAHit)
        result.Attributes.Add(result.UniqueAttributeId.ToString(), "true");

    return result;
}
protected override IEnumerable<string> GetHrefValues(CrawledPage crawledPage)
{
    List<string> hrefValues = new List<string>();
    if (HasRobotsNoFollow(crawledPage))
        return hrefValues;

    //HtmlNodeCollection productLinkNodes = _helper.GetProductLinkNodes(crawledPage.HtmlDocument);
    //hrefValues.AddRange(GetLinks(productLinkNodes));

    var productLinks = _helper.GetProductLinksFromListPage(crawledPage);
    hrefValues.AddRange(productLinks);

    HtmlNodeCollection pageLinkNodes = _helper.GetPageLinkNodesFromListPage(crawledPage);
    hrefValues.AddRange(GetLinks(pageLinkNodes));

    return hrefValues.Distinct().ToList();
}
protected virtual void SchedulePageLinks(CrawledPage crawledPage)
{
    IEnumerable<Uri> crawledPageLinks = _hyperLinkParser.GetLinks(crawledPage);
    foreach (Uri uri in crawledPageLinks)
    {
        //Added due to a bug in the Uri class related to this (http://stackoverflow.com/questions/2814951/system-uriformatexception-invalid-uri-the-hostname-could-not-be-parsed)
        try
        {
            PageToCrawl page = new CrawledPage(uri);
            page.ParentUri = crawledPage.Uri;
            page.CrawlDepth = crawledPage.CrawlDepth + 1;
            page.IsInternal = _isInternalDecisionMaker(uri, _crawlContext.RootUri);
            page.IsRoot = false;

            _scheduler.Add(page);
        }
        catch { }
    }
}
protected virtual void ProcessPage(PageToCrawl pageToCrawl)
{
    if (pageToCrawl == null)
        return;

    try
    {
        ThrowIfCancellationRequested();

        AddPageToContext(pageToCrawl);

        CrawledPage crawledPage = CrawlThePage(pageToCrawl);

        FirePageCrawlCompletedEventAsync(crawledPage);

        bool shouldCrawlPageLinks = ShouldCrawlPageLinks(crawledPage);
        if (shouldCrawlPageLinks)
        {
            ParsePageDocument(crawledPage);
            SchedulePageLinks(crawledPage);
        }

        if (crawledPage.IsRetry || ShouldRecrawlPage(crawledPage))
        {
            crawledPage.IsRetry = true;
            _scheduler.Add(crawledPage);
        }
    }
    catch (OperationCanceledException oce)
    {
        _logger.LogDebug("Thread cancelled while crawling/processing page [{0}]", pageToCrawl.Uri);
        throw oce;
    }
    catch (Exception e)
    {
        _logger.LogError(e, e.Message);
        _crawlResult.Error = e;
        _crawlContext.IsCrawlHardStopRequested = true;
    }
}
protected virtual void FirePageCrawlCompletedEventAsync(CrawledPage crawledPage)
{
    EventHandler<PageCrawlCompletedArgs> threadSafeEvent = PageCrawlCompletedAsync;
    if (threadSafeEvent == null)
        return;

    if (_scheduler.Count == 0)
    {
        //Must be fired synchronously to avoid main thread exiting before completion of event handler for first or last page crawled
        try
        {
            threadSafeEvent(this, new PageCrawlCompletedArgs(_crawlContext, crawledPage));
        }
        catch (Exception e)
        {
            _logger.Error("An unhandled exception was thrown by a subscriber of the PageCrawlCompleted event for url:" + crawledPage.Uri.AbsoluteUri);
            _logger.Error(e);
        }
    }
    else
    {
        //Fire each subscriber's delegate async
        foreach (EventHandler<PageCrawlCompletedArgs> del in threadSafeEvent.GetInvocationList())
        {
            del.BeginInvoke(this, new PageCrawlCompletedArgs(_crawlContext, crawledPage), null, null);
        }
    }
}
protected virtual CrawlDecision ShouldDownloadPageContent(CrawledPage crawledPage)
{
    CrawlDecision decision = _crawlDecisionMaker.ShouldDownloadPageContent(crawledPage, _crawlContext);
    if (decision.Allow)
    {
        decision = (_shouldDownloadPageContentDecisionMaker != null)
            ? _shouldDownloadPageContentDecisionMaker.Invoke(crawledPage, _crawlContext)
            : new CrawlDecision { Allow = true };
    }

    SignalCrawlStopIfNeeded(decision);
    return decision;
}
protected override string GetBaseHrefValue(CrawledPage crawledPage)
{
    string baseTagValue = crawledPage.CsQueryDocument.Select("base").Attr("href") ?? "";
    return baseTagValue.Trim();
}
protected virtual void FirePageLinksCrawlDisallowedEventAsync(CrawledPage crawledPage, string reason)
{
    EventHandler<PageLinksCrawlDisallowedArgs> threadSafeEvent = PageLinksCrawlDisallowedAsync;
    if (threadSafeEvent != null)
    {
        //Fire each subscriber's delegate async
        foreach (EventHandler<PageLinksCrawlDisallowedArgs> del in threadSafeEvent.GetInvocationList())
        {
            del.BeginInvoke(this, new PageLinksCrawlDisallowedArgs(_crawlContext, crawledPage, reason), null, null);
        }
    }
}
protected virtual void FirePageLinksCrawlDisallowedEvent(CrawledPage crawledPage, string reason)
{
    try
    {
        EventHandler<PageLinksCrawlDisallowedArgs> threadSafeEvent = PageLinksCrawlDisallowed;
        if (threadSafeEvent != null)
            threadSafeEvent(this, new PageLinksCrawlDisallowedArgs(_crawlContext, crawledPage, reason));
    }
    catch (Exception e)
    {
        _logger.Error("An unhandled exception was thrown by a subscriber of the PageLinksCrawlDisallowed event for url:" + crawledPage.Uri.AbsoluteUri);
        _logger.Error(e);
    }
}
protected virtual bool IsRedirect(CrawledPage crawledPage)
{
    bool isRedirect = false;
    if (crawledPage.HttpWebResponse != null)
    {
        isRedirect = (_crawlContext.CrawlConfiguration.IsHttpRequestAutoRedirectsEnabled &&
                      crawledPage.HttpWebResponse.ResponseUri != null &&
                      crawledPage.HttpWebResponse.ResponseUri.AbsoluteUri != crawledPage.Uri.AbsoluteUri) ||
                     (!_crawlContext.CrawlConfiguration.IsHttpRequestAutoRedirectsEnabled &&
                      (int)crawledPage.HttpWebResponse.StatusCode >= 300 &&
                      (int)crawledPage.HttpWebResponse.StatusCode <= 399);
    }

    return isRedirect;
}
protected override IEnumerable<string> GetHrefValues(CrawledPage crawledPage)
{
    List<string> hrefValues = new List<string>();
    if (HasRobotsNoFollow(crawledPage))
        return hrefValues;

    HtmlNodeCollection aTags = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//a[@href]");
    HtmlNodeCollection areaTags = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//area[@href]");
    HtmlNodeCollection canonicals = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//link[@rel='canonical'][@href]");

    hrefValues.AddRange(GetLinks(aTags));
    hrefValues.AddRange(GetLinks(areaTags));
    hrefValues.AddRange(GetLinks(canonicals));
    hrefValues.AddRange(GetLinksByKeyword(crawledPage, "KeywordExternalLink"));
    hrefValues.AddRange(GetLinksByKeyword(crawledPage, "KeywordID"));

    return hrefValues;
}
/// <summary>
/// Retrieve the URI where the specified crawled page was redirected.
/// </summary>
/// <remarks>
/// If HTTP auto redirection is disabled, this value is stored in the 'Location' header of the response.
/// If auto redirection is enabled, this value is stored in the response's ResponseUri property.
/// </remarks>
protected virtual Uri ExtractRedirectUri(CrawledPage crawledPage)
{
    Uri locationUri;
    if (_crawlContext.CrawlConfiguration.IsHttpRequestAutoRedirectsEnabled)
    {
        // For auto redirects, look for the response uri.
        locationUri = crawledPage.HttpWebResponse.ResponseUri;
    }
    else
    {
        // For manual redirects, we need to look for the location header.
        var location = crawledPage.HttpWebResponse.Headers["Location"];

        // Check if the location is absolute. If not, create an absolute uri.
        if (!Uri.TryCreate(location, UriKind.Absolute, out locationUri))
        {
            Uri baseUri = new Uri(crawledPage.Uri.GetLeftPart(UriPartial.Authority));
            locationUri = new Uri(baseUri, location);
        }
    }

    return locationUri;
}
private IEnumerable<string> GetLinksByKeyword(CrawledPage crawledPage, string keyword)
{
    List<string> result = new List<string>();

    string keywordValue = string.Empty;
    if (Utility.GetConfigurationValue(_config, keyword, out keywordValue))
    {
        if (!string.IsNullOrEmpty(keywordValue))
        {
            switch (keyword)
            {
                case "KeywordExternalLink":
                    // Matches json-style fragments such as: <keywordValue>":"some-title/12345",
                    MatchCollection matches = Regex.Matches(crawledPage.HtmlDocument.DocumentNode.OuterHtml, keywordValue + @""":""[A-Za-z|-]+/\d{1,10}"",", RegexOptions.IgnoreCase);
                    foreach (Match item in matches)
                    {
                        Match match = Regex.Match(item.Value, @"[A-Za-z|-]+/\d{1,10}", RegexOptions.IgnoreCase);
                        result.Add(Utility.ConvertRelativeUrl(crawledPage.ParentUri.AbsoluteUri, match.Value));
                    }
                    break;

                case "KeywordID":
                    // Matches json-style fragments such as: ],"<keywordValue>":12345,
                    matches = Regex.Matches(crawledPage.HtmlDocument.DocumentNode.OuterHtml, @"],""" + keywordValue + @""":\d{1,10},", RegexOptions.IgnoreCase);
                    foreach (Match item in matches)
                    {
                        Match match = Regex.Match(item.Value, @"\d{1,10}", RegexOptions.IgnoreCase);
                        result.Add(Utility.ConvertRelativeUrl(crawledPage.ParentUri.AbsoluteUri, "/hands-on-labs/" + match.Value));
                    }
                    break;

                default:
                    break;
            }
        }
    }

    return result;
}
protected override string GetMetaRobotsValue(CrawledPage crawledPage)
{
    return crawledPage.CsQueryDocument["meta[name]"]
        .Filter(d => d.Name.ToLowerInvariant() == "robots")
        .Attr("content");
}
protected virtual void ParsePageLinks(CrawledPage crawledPage)
{
    crawledPage.ParsedLinks = _hyperLinkParser.GetLinks(crawledPage);
}
protected virtual void ProcessRedirect(CrawledPage crawledPage)
{
    if (crawledPage.RedirectPosition >= 20)
        _logger.WarnFormat("Page [{0}] is part of a chain of 20 or more consecutive redirects, redirects for this chain will now be aborted.", crawledPage.Uri);

    try
    {
        var uri = ExtractRedirectUri(crawledPage);

        PageToCrawl page = new PageToCrawl(uri);
        page.ParentUri = crawledPage.ParentUri;
        page.CrawlDepth = crawledPage.CrawlDepth;
        page.IsInternal = IsInternalUri(uri);
        page.IsRoot = false;
        page.RedirectedFrom = crawledPage;
        page.RedirectPosition = crawledPage.RedirectPosition + 1;

        crawledPage.RedirectedTo = page;
        _logger.DebugFormat("Page [{0}] is requesting that it be redirected to [{1}]", crawledPage.Uri, crawledPage.RedirectedTo.Uri);

        if (ShouldSchedulePageLink(page))
        {
            if (_scheduler.IsUriKnown(uri))
            {
                _logger.InfoFormat("Page [{0}] is redirected to [{1}], which is a page already crawled.", crawledPage.Uri, crawledPage.RedirectedTo.Uri);
            }
            else
            {
                _logger.InfoFormat("Page [{0}] will be redirected to [{1}]", crawledPage.Uri, crawledPage.RedirectedTo.Uri);
                _scheduler.Add(page);
            }
        }
    }
    catch { }
}
protected virtual void SchedulePageLinks(CrawledPage crawledPage)
{
    foreach (Uri uri in crawledPage.ParsedLinks)
    {
        // First validate that the link was not already visited or added to the list of pages to visit,
        // so we don't make the same validation and fire the same events twice.
        if (!_scheduler.IsUriKnown(uri) &&
            (_shouldScheduleLinkDecisionMaker == null || _shouldScheduleLinkDecisionMaker.Invoke(uri, crawledPage, _crawlContext)))
        {
            try //Added due to a bug in the Uri class related to this (http://stackoverflow.com/questions/2814951/system-uriformatexception-invalid-uri-the-hostname-could-not-be-parsed)
            {
                PageToCrawl page = new PageToCrawl(uri);
                page.ParentUri = crawledPage.Uri;
                page.CrawlDepth = crawledPage.CrawlDepth + 1;
                page.IsInternal = IsInternalUri(uri);
                page.IsRoot = false;

                if (ShouldSchedulePageLink(page))
                    _scheduler.Add(page);
            }
            catch { }
        }

        // Add this link to the list of known Urls so validations are not duplicated in the future.
        _scheduler.AddKnownUri(uri);
    }
}
protected virtual bool ShouldCrawlPageLinks(CrawledPage crawledPage)
{
    CrawlDecision shouldCrawlPageLinksDecision = _crawlDecisionMaker.ShouldCrawlPageLinks(crawledPage, _crawlContext);
    if (shouldCrawlPageLinksDecision.Allow)
    {
        shouldCrawlPageLinksDecision = (_shouldCrawlPageLinksDecisionMaker != null)
            ? _shouldCrawlPageLinksDecisionMaker.Invoke(crawledPage, _crawlContext)
            : new CrawlDecision { Allow = true };
    }

    if (!shouldCrawlPageLinksDecision.Allow)
    {
        _logger.DebugFormat("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldCrawlPageLinksDecision.Reason);
        FirePageLinksCrawlDisallowedEventAsync(crawledPage, shouldCrawlPageLinksDecision.Reason);
        FirePageLinksCrawlDisallowedEvent(crawledPage, shouldCrawlPageLinksDecision.Reason);
    }

    SignalCrawlStopIfNeeded(shouldCrawlPageLinksDecision);
    return shouldCrawlPageLinksDecision.Allow;
}
protected virtual bool ShouldRecrawlPage(CrawledPage crawledPage)
{
    //TODO No unit tests cover these lines
    CrawlDecision shouldRecrawlPageDecision = _crawlDecisionMaker.ShouldRecrawlPage(crawledPage, _crawlContext);
    if (shouldRecrawlPageDecision.Allow)
    {
        shouldRecrawlPageDecision = (_shouldRecrawlPageDecisionMaker != null)
            ? _shouldRecrawlPageDecisionMaker.Invoke(crawledPage, _crawlContext)
            : new CrawlDecision { Allow = true };
    }

    if (!shouldRecrawlPageDecision.Allow)
    {
        _logger.DebugFormat("Page [{0}] not recrawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldRecrawlPageDecision.Reason);
    }
    else
    {
        // Look for the Retry-After header in the response.
        crawledPage.RetryAfter = null;
        if (crawledPage.HttpWebResponse != null && crawledPage.HttpWebResponse.Headers != null)
        {
            string value = crawledPage.HttpWebResponse.GetResponseHeader("Retry-After");
            if (!String.IsNullOrEmpty(value))
            {
                // Try to convert to DateTime first, then to double (seconds).
                DateTime date;
                double seconds;
                if (crawledPage.LastRequest.HasValue && DateTime.TryParse(value, out date))
                    crawledPage.RetryAfter = (date - crawledPage.LastRequest.Value).TotalSeconds;
                else if (double.TryParse(value, out seconds))
                    crawledPage.RetryAfter = seconds;
            }
        }
    }

    SignalCrawlStopIfNeeded(shouldRecrawlPageDecision);
    return shouldRecrawlPageDecision.Allow;
}
public virtual CrawlDecision ShouldDownloadPageContent(CrawledPage crawledPage, CrawlContext crawlContext)
{
    if (crawledPage == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawled page" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    if (crawledPage.HttpWebResponse == null)
        return new CrawlDecision { Allow = false, Reason = "Null HttpWebResponse" };

    if (crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
        return new CrawlDecision { Allow = false, Reason = "HttpStatusCode is not 200" };

    string pageContentType = crawledPage.HttpWebResponse.ContentType.ToLower().Trim();
    bool isDownloadable = false;
    List<string> cleanDownloadableContentTypes = crawlContext.CrawlConfiguration.DownloadableContentTypes
        .Split(',')
        .Select(t => t.Trim())
        .Where(t => !string.IsNullOrEmpty(t))
        .ToList();

    foreach (string downloadableContentType in cleanDownloadableContentTypes)
    {
        if (pageContentType.Contains(downloadableContentType.ToLower().Trim()))
        {
            isDownloadable = true;
            break;
        }
    }

    if (!isDownloadable)
        return new CrawlDecision { Allow = false, Reason = "Content type is not any of the following: " + string.Join(",", cleanDownloadableContentTypes) };

    if (crawlContext.CrawlConfiguration.MaxPageSizeInBytes > 0 && crawledPage.HttpWebResponse.ContentLength > crawlContext.CrawlConfiguration.MaxPageSizeInBytes)
        return new CrawlDecision { Allow = false, Reason = string.Format("Page size of [{0}] bytes is above the max allowable of [{1}] bytes", crawledPage.HttpWebResponse.ContentLength, crawlContext.CrawlConfiguration.MaxPageSizeInBytes) };

    return new CrawlDecision { Allow = true };
}
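The content-type check above is driven entirely by CrawlConfiguration.DownloadableContentTypes, a comma-separated list that gets split and matched against the response Content-Type. A configuration sketch (the specific types and size limit are illustrative, not recommendations):

var config = new CrawlConfiguration
{
    DownloadableContentTypes = "text/html, text/plain", // split on ',' and matched against the response Content-Type
    MaxPageSizeInBytes = 1024 * 1024                     // 0 disables the page-size check in the decision above
};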
/// <summary>
/// Validate that the Root page was not redirected. If the root page is redirected, we assume that the root uri
/// should be changed to the uri where it was redirected.
/// </summary>
protected virtual void ValidateRootUriForRedirection(CrawledPage crawledRootPage)
{
    if (!crawledRootPage.IsRoot)
        throw new ArgumentException("The crawled page must be the root page to be validated for redirection.");

    if (IsRedirect(crawledRootPage))
    {
        _crawlContext.RootUri = ExtractRedirectUri(crawledRootPage);
        _logger.InfoFormat("The root URI [{0}] was redirected to [{1}]. [{1}] is the new root.", _crawlContext.OriginalRootUri, _crawlContext.RootUri);
    }
}
protected override string GetMetaRobotsValue(CrawledPage crawledPage)
{
    string robotsMeta = null;
    HtmlNode robotsNode = crawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//meta[translate(@name,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')='robots']");
    if (robotsNode != null)
        robotsMeta = robotsNode.GetAttributeValue("content", "");

    return robotsMeta;
}
protected virtual bool PageSizeIsAboveMax(CrawledPage crawledPage)
{
    bool isAboveMax = false;
    if (_crawlContext.CrawlConfiguration.MaxPageSizeInBytes > 0 &&
        crawledPage.Content.Bytes != null &&
        crawledPage.Content.Bytes.Length > _crawlContext.CrawlConfiguration.MaxPageSizeInBytes)
    {
        isAboveMax = true;
        _logger.InfoFormat("Page [{0}] has a page size of [{1}] bytes which is above the [{2}] byte max, no further processing will occur for this page", crawledPage.Uri, crawledPage.Content.Bytes.Length, _crawlContext.CrawlConfiguration.MaxPageSizeInBytes);
    }

    return isAboveMax;
}
public virtual CrawlDecision ShouldRecrawlPage(CrawledPage crawledPage, CrawlContext crawlContext)
{
    if (crawledPage == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawled page" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    if (crawledPage.WebException == null)
        return new CrawlDecision { Allow = false, Reason = "WebException did not occur" };

    if (crawlContext.CrawlConfiguration.MaxRetryCount < 1)
        return new CrawlDecision { Allow = false, Reason = "MaxRetryCount is less than 1" };

    if (crawledPage.RetryCount >= crawlContext.CrawlConfiguration.MaxRetryCount)
        return new CrawlDecision { Allow = false, Reason = "MaxRetryCount has been reached" };

    return new CrawlDecision { Allow = true };
}
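Note that this decision only allows a recrawl when a WebException occurred, so retries cover transport-level failures rather than HTTP error status codes, and they stop once the page's RetryCount reaches CrawlConfiguration.MaxRetryCount. A minimal sketch of enabling retries (the count is illustrative):

var config = new CrawlConfiguration
{
    MaxRetryCount = 3 // values below 1 disable retries per the decision above
};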