public CrawlArgs(CrawlContext crawlContext)
{
    if (crawlContext == null)
        throw new ArgumentNullException("crawlContext");

    CrawlContext = crawlContext;
}
public PageCrawlCompletedArgs(CrawlContext crawlContext, CrawledPage crawledPage)
    : base(crawlContext)
{
    if (crawledPage == null)
        throw new ArgumentNullException("crawledPage");

    CrawledPage = crawledPage;
}
public PageLinksCrawlDisallowedArgs(CrawlContext crawlContext, CrawledPage crawledPage, string disallowedReason)
    : base(crawlContext, crawledPage)
{
    if (string.IsNullOrWhiteSpace(disallowedReason))
        throw new ArgumentNullException("disallowedReason");

    DisallowedReason = disallowedReason;
}
public PageCrawlStartingArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl)
    : base(crawlContext)
{
    if (pageToCrawl == null)
        throw new ArgumentNullException("pageToCrawl");

    PageToCrawl = pageToCrawl;
}
public PageCrawlDisallowedArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl, string disallowedReason)
    : base(crawlContext, pageToCrawl)
{
    if (string.IsNullOrWhiteSpace(disallowedReason))
        throw new ArgumentNullException("disallowedReason");

    DisallowedReason = disallowedReason;
}
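// Usage sketch (not library code): the args classes above carry crawl state
// to event subscribers on the crawler. The event names used below
// (PageCrawlCompleted, PageCrawlDisallowed) are assumptions for illustration;
// substitute whatever events the crawler actually exposes.
public static class CrawlEventSketch
{
    public static void Attach(WebCrawler crawler)
    {
        // Fires after a page has been requested and processed
        crawler.PageCrawlCompleted += (sender, e) =>
            System.Console.WriteLine("Crawled: " + e.CrawledPage.Uri.AbsoluteUri);

        // Fires when the decision maker vetoes a page; DisallowedReason
        // explains why (see the CrawlDecision methods below)
        crawler.PageCrawlDisallowed += (sender, e) =>
            System.Console.WriteLine("Skipped [" + e.PageToCrawl.Uri + "]: " + e.DisallowedReason);
    }
}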
public virtual CrawlDecision ShouldCrawlPageLinks(CrawledPage crawledPage, CrawlContext crawlContext)
{
    if (crawledPage == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawled page" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    if (string.IsNullOrWhiteSpace(crawledPage.RawContent))
        return new CrawlDecision { Allow = false, Reason = "Page has no content" };

    if (!crawlContext.CrawlConfiguration.IsExternalPageLinksCrawlingEnabled && !crawledPage.IsInternal)
        return new CrawlDecision { Allow = false, Reason = "Link is external" };

    if (crawledPage.CrawlDepth >= crawlContext.CrawlConfiguration.MaxCrawlDepth)
        return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

    return new CrawlDecision { Allow = true };
}
public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
    if (pageToCrawl == null)
        return new CrawlDecision { Allow = false, Reason = "Null page to crawl" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    if (pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
        return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

    if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
        return new CrawlDecision { Allow = false, Reason = "Scheme does not begin with http" };

    if (!crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled && crawlContext.CrawledUrls.ContainsKey(pageToCrawl.Uri.AbsoluteUri))
        return new CrawlDecision { Allow = false, Reason = "Link already crawled" };

    // Request a stop of the entire crawl once the overall page budget is exhausted
    if (crawlContext.CrawledUrls.Count + 1 > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
    {
        crawlContext.IsCrawlStopRequested = true;
        return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl) };
    }

    // Enforce the per-domain limit only when one is configured (> 0) and
    // at least one page has already been crawled in this domain
    int pagesCrawledInThisDomain = 0;
    if (crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain > 0
        && crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out pagesCrawledInThisDomain)
        && pagesCrawledInThisDomain > 0)
    {
        if (pagesCrawledInThisDomain >= crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain)
            return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawlPerDomain limit of [{0}] has been reached for domain [{1}]", crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain, pageToCrawl.Uri.Authority) };
    }

    if (!crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled && !pageToCrawl.IsInternal)
        return new CrawlDecision { Allow = false, Reason = "Link is external" };

    return new CrawlDecision { Allow = true };
}
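// Usage sketch (not library code): how a caller might consult the decision
// maker before scheduling a page. The scheduler.Add call and the populated
// pageToCrawl/crawlContext arguments are assumptions for illustration.
public static class DecisionUsageSketch
{
    public static void MaybeSchedule(PageToCrawl pageToCrawl, CrawlContext crawlContext, IScheduler scheduler)
    {
        var decisionMaker = new CrawlDecisionMaker();
        CrawlDecision decision = decisionMaker.ShouldCrawlPage(pageToCrawl, crawlContext);

        if (decision.Allow)
            scheduler.Add(pageToCrawl);  // assumed IScheduler method
        else
            System.Console.WriteLine("Not crawling: " + decision.Reason);
    }
}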
/// <summary>
/// Creates a crawler instance with custom settings or implementations. Passing null for all params is equivalent to using the empty constructor.
/// </summary>
/// <param name="crawlConfiguration">Configurable crawl values</param>
/// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
/// <param name="threadManager">Distributes http requests over multiple threads</param>
/// <param name="scheduler">Decides what link should be crawled next</param>
/// <param name="httpRequester">Makes the raw http requests</param>
/// <param name="hyperLinkParser">Parses a crawled page for its hyperlinks</param>
/// <param name="memoryManager">Tracks process memory usage during the crawl</param>
public WebCrawler(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester httpRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager)
{
    _crawlContext = new CrawlContext();
    _crawlContext.CrawlConfiguration = crawlConfiguration ?? GetCrawlConfigurationFromConfigFile() ?? new CrawlConfiguration();
    CrawlBag = _crawlContext.CrawlBag;

    // Each null dependency falls back to the default implementation
    _threadManager = threadManager ?? new ManualThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads);
    _scheduler = scheduler ?? new FifoScheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled);
    _httpRequester = httpRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
    _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

    // Only wire up memory monitoring if a memory limit is configured
    if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0
        || _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
        _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));

    _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser();

    _crawlContext.Scheduler = _scheduler;
}
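// Usage sketch (not library code): every null argument falls back to the
// default implementation, so the first call behaves like the empty
// constructor described in the summary above. MaxCrawlDepth and
// MaxPagesToCrawl are configuration properties referenced elsewhere in this
// code; the chosen values are illustrative.
public static class CrawlerConstructionSketch
{
    public static void Demonstrate()
    {
        // All defaults
        var crawlerWithDefaults = new WebCrawler(null, null, null, null, null, null, null);

        // Swap in only a tuned configuration, keep every other default
        var config = new CrawlConfiguration { MaxCrawlDepth = 2, MaxPagesToCrawl = 100 };
        var crawlerWithConfig = new WebCrawler(config, null, null, null, null, null, null);
    }
}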
public virtual CrawlDecision ShouldDownloadPageContent(CrawledPage crawledPage, CrawlContext crawlContext)
{
    if (crawledPage == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawled page" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    if (crawledPage.HttpWebResponse == null)
        return new CrawlDecision { Allow = false, Reason = "Null HttpWebResponse" };

    if (crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
        return new CrawlDecision { Allow = false, Reason = "HttpStatusCode is not 200" };

    // Allow the download only if the response content type matches one of
    // the configured downloadable types (comma-delimited, case-insensitive)
    string pageContentType = crawledPage.HttpWebResponse.ContentType.ToLower().Trim();
    bool isDownloadable = false;
    foreach (string downloadableContentType in crawlContext.CrawlConfiguration.DownloadableContentTypes.Split(','))
    {
        if (pageContentType.Contains(downloadableContentType.ToLower().Trim()))
        {
            isDownloadable = true;
            break;
        }
    }
    if (!isDownloadable)
        return new CrawlDecision { Allow = false, Reason = "Content type is not any of the following: " + crawlContext.CrawlConfiguration.DownloadableContentTypes };

    // Content has not been downloaded yet, so the size check (and the
    // reported size) must use the Content-Length response header
    if (crawlContext.CrawlConfiguration.MaxPageSizeInBytes > 0 && crawledPage.HttpWebResponse.ContentLength > crawlContext.CrawlConfiguration.MaxPageSizeInBytes)
        return new CrawlDecision { Allow = false, Reason = string.Format("Page size of [{0}] bytes is above the max allowable of [{1}] bytes", crawledPage.HttpWebResponse.ContentLength, crawlContext.CrawlConfiguration.MaxPageSizeInBytes) };

    return new CrawlDecision { Allow = true };
}
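// Extension sketch (not library code): because the decision methods above
// are virtual, a custom policy can subclass CrawlDecisionMaker and tighten a
// single rule while delegating everything else to the base class. This
// hypothetical subclass additionally rejects pages larger than 1 MB even
// when no MaxPageSizeInBytes is configured.
public class StrictDecisionMaker : CrawlDecisionMaker
{
    private const long HardSizeCapInBytes = 1024 * 1024;  // illustrative cap

    public override CrawlDecision ShouldDownloadPageContent(CrawledPage crawledPage, CrawlContext crawlContext)
    {
        // Run the base rules first; keep any disallow decision as-is
        CrawlDecision baseDecision = base.ShouldDownloadPageContent(crawledPage, crawlContext);
        if (!baseDecision.Allow)
            return baseDecision;

        // Then layer on the stricter size rule
        if (crawledPage.HttpWebResponse.ContentLength > HardSizeCapInBytes)
            return new CrawlDecision { Allow = false, Reason = "Page exceeds hard 1 MB cap" };

        return baseDecision;
    }
}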