/// <summary>
/// Creates a crawler instance with custom settings or implementations.
/// Passing null for every parameter is equivalent to using the empty constructor.
/// </summary>
/// <param name="crawlConfiguration">Configurable crawl values</param>
/// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
/// <param name="threadManager">Distributes http requests over multiple threads</param>
/// <param name="scheduler">Decides what link should be crawled next</param>
/// <param name="httpRequester">Makes the raw http requests</param>
/// <param name="hyperLinkParser">Parses a crawled page for its hyperlinks</param>
/// <param name="memoryManager">Checks the memory usage of the host process</param>
public WebCrawler(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester httpRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager)
{
    // Configuration resolution order: explicit argument -> app config file -> library defaults.
    var configuration = crawlConfiguration
        ?? GetCrawlConfigurationFromConfigFile()
        ?? new CrawlConfiguration();

    _crawlContext = new CrawlContext { CrawlConfiguration = configuration };
    CrawlBag = _crawlContext.CrawlBag;

    _threadManager = threadManager ?? new ManualThreadManager(configuration.MaxConcurrentThreads);
    _scheduler = scheduler ?? new FifoScheduler(configuration.IsUriRecrawlingEnabled);
    _httpRequester = httpRequester ?? new PageRequester(configuration);
    _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

    // Memory is only monitored when the configuration imposes a memory constraint.
    if (configuration.MaxMemoryUsageInMb > 0 || configuration.MinAvailableMemoryRequiredInMb > 0)
    {
        _memoryManager = memoryManager ?? new MemoryManager(
            new CachedMemoryMonitor(new GcMemoryMonitor(), configuration.MaxMemoryUsageCacheTimeInSeconds));
    }

    _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser();
    _crawlContext.Scheduler = _scheduler;
}
/// <summary>
/// Creates a crawler instance with custom settings or implementations.
/// Passing null for every parameter is equivalent to using the empty constructor.
/// </summary>
/// <param name="crawlConfiguration">Configurable crawl values</param>
/// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
/// <param name="threadManager">Distributes http requests over multiple threads</param>
/// <param name="scheduler">Decides what link should be crawled next</param>
/// <param name="pageRequester">Makes the raw http requests</param>
/// <param name="htmlParser">Parses a crawled page for its hyperlinks</param>
/// <param name="memoryManager">Checks the memory usage of the host process</param>
public WebCrawler(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester pageRequester,
    IHtmlParser htmlParser,
    IMemoryManager memoryManager)
{
    var config = crawlConfiguration ?? new CrawlConfiguration();
    _crawlContext = new CrawlContext { CrawlConfiguration = config };
    CrawlBag = _crawlContext.CrawlBag;

    // Fall back to one worker per logical processor when no explicit thread limit is configured.
    var maxThreads = config.MaxConcurrentThreads > 0
        ? config.MaxConcurrentThreads
        : Environment.ProcessorCount;
    _threadManager = threadManager ?? new TaskThreadManager(maxThreads);

    _scheduler = scheduler ?? new Scheduler(config.IsUriRecrawlingEnabled, null, null);
    _pageRequester = pageRequester ?? new PageRequester(config, new WebContentExtractor());
    _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

    // Memory is only monitored when the configuration imposes a memory constraint.
    if (config.MaxMemoryUsageInMb > 0 || config.MinAvailableMemoryRequiredInMb > 0)
    {
        _memoryManager = memoryManager ?? new MemoryManager(
            new CachedMemoryMonitor(new GcMemoryMonitor(), config.MaxMemoryUsageCacheTimeInSeconds));
    }

    _htmlParser = htmlParser ?? new AngleSharpHyperlinkParser(config, null);
    _crawlContext.Scheduler = _scheduler;
}
/// <summary>
/// Creates a finder that uses the given requester to retrieve robots.txt content.
/// </summary>
/// <param name="pageRequester">Makes the raw http requests; required</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="pageRequester"/> is null.</exception>
public RobotsDotTextFinder(IPageRequester pageRequester)
{
    if (pageRequester == null)
    {
        // nameof is refactor-safe, unlike the original hard-coded "pageRequester" string.
        throw new ArgumentNullException(nameof(pageRequester));
    }

    _pageRequester = pageRequester;
}
/// <summary>
/// Creates a crawler instance with custom settings or implementation.
/// Passing in null for all params is the equivalent of the empty constructor.
/// </summary>
/// <param name="crawlConfiguration">Configurable crawl values</param>
/// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
/// <param name="threadManager">Distributes http requests over multiple threads</param>
/// <param name="scheduler">Decides what link should be crawled next</param>
/// <param name="pageRequester">Makes the raw http requests</param>
/// <param name="hyperLinkParser">Parses a crawled page for its hyperlinks</param>
/// <param name="memoryManager">Checks the memory usage of the host process</param>
public WebCrawler(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester pageRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager)
{
    _crawlContext = new CrawlContext();

    // Resolution order: explicit argument -> config file -> library defaults.
    // The final "?? new CrawlConfiguration()" fallback prevents a NullReferenceException
    // below when no argument is given and GetCrawlConfigurationFromConfigFile() returns
    // null; it matches the other WebCrawler overloads in this file.
    _crawlContext.CrawlConfiguration = crawlConfiguration
        ?? GetCrawlConfigurationFromConfigFile()
        ?? new CrawlConfiguration();
    CrawlBag = _crawlContext.CrawlBag;

    // Fall back to one worker per logical processor when no explicit thread limit is configured.
    _threadManager = threadManager ?? new TaskThreadManager(
        _crawlContext.CrawlConfiguration.MaxConcurrentThreads > 0
            ? _crawlContext.CrawlConfiguration.MaxConcurrentThreads
            : Environment.ProcessorCount);

    _scheduler = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, null, null);
    _pageRequester = pageRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
    _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

    // Memory is only monitored when the configuration imposes a memory constraint.
    if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0
        || _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
    {
        _memoryManager = memoryManager ?? new MemoryManager(
            new CachedMemoryMonitor(new GcMemoryMonitor(),
                _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));
    }

    _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser(
        _crawlContext.CrawlConfiguration.IsRespectMetaRobotsNoFollowEnabled,
        _crawlContext.CrawlConfiguration.IsRespectAnchorRelNoFollowEnabled);

    _crawlContext.Scheduler = _scheduler;
}
/// <summary>
/// Creates a finder that uses the given requester to retrieve robots.txt content.
/// </summary>
/// <param name="pageRequester">Makes the raw http requests; required</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="pageRequester"/> is null.</exception>
public RobotsDotTextFinder(IPageRequester pageRequester)
{
    if (pageRequester == null)
    {
        // nameof is refactor-safe, unlike the original hard-coded "pageRequester" string.
        throw new ArgumentNullException(nameof(pageRequester));
    }

    _pageRequester = pageRequester;
}
/// <summary>
/// Creates a site crawler; any null collaborator is replaced with its default implementation.
/// </summary>
/// <param name="pageRequester">Makes the raw http requests</param>
/// <param name="scheduler">Decides what link should be crawled next</param>
/// <param name="linkParser">Parses a crawled page for its links</param>
public SiteCrawler(
    IPageRequester pageRequester,
    IScheduler scheduler,
    ILinkParser linkParser)
{
    // Each dependency independently falls back to the library default when null.
    _pageRequester = pageRequester ?? new DefaultPageUrlRequester();
    _scheduler = scheduler ?? new DefaultScheduler();
    _linkParser = linkParser ?? new DefaultLinkParser();
}
/// <summary>
/// Creates a polite crawler that adds per-domain rate limiting and robots.txt
/// handling on top of the base crawler. Null params fall back to defaults.
/// </summary>
public PoliteWebCrawler(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IScheduler scheduler,
    IPageRequester pageRequester,
    IHtmlParser htmlParser,
    IMemoryManager memoryManager,
    IDomainRateLimiter domainRateLimiter,
    IRobotsDotTextFinder robotsDotTextFinder)
    : base(crawlConfiguration, crawlDecisionMaker, scheduler, pageRequester, htmlParser, memoryManager)
{
    // The rate limiter enforces the configured minimum delay between requests to one domain.
    _domainRateLimiter = domainRateLimiter
        ?? new DomainRateLimiter(_crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds);

    // The default robots.txt finder is built with its own fresh page requester.
    _robotsDotTextFinder = robotsDotTextFinder
        ?? new RobotsDotTextFinder(new PageRequester(_crawlContext.CrawlConfiguration, new WebContentExtractor()));
}
/// <summary>
/// Creates a crawler instance with custom settings or implementation
/// </summary>
/// <param name="crawlConfiguration">Configurable crawl values</param>
/// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
/// <param name="threadManager">Distributes http requests over multiple threads</param>
/// <param name="scheduler">Decides what link should be crawled next</param>
/// <param name="pageRequester">Makes the raw http requests</param>
/// <param name="hyperLinkParser">Parses a crawled page for it's hyperlinks</param>
/// <param name="memoryManager">Checks the memory usage of the host process</param>
/// <param name="domainRateLimiter">Enforces per-domain request pacing (defaulted by the base class when null)</param>
/// <param name="robotsDotTextFinder">Retrieves robots.txt (defaulted by the base class when null)</param>
/// <param name="sitemapLoader">Loads sitemaps; defaults to a robots.txt-based loader when null</param>
public GoogleWebCrawler(
    CrawlConfiguration crawlConfiguration = null,
    ICrawlDecisionMaker crawlDecisionMaker = null,
    IThreadManager threadManager = null,
    IScheduler scheduler = null,
    IPageRequester pageRequester = null,
    IHyperLinkParser hyperLinkParser = null,
    IMemoryManager memoryManager = null,
    IDomainRateLimiter domainRateLimiter = null,
    IRobotsDotTextFinder robotsDotTextFinder = null,
    IRobotsSitemapLoader sitemapLoader = null)
    : base(SetConfig(crawlConfiguration),
        crawlDecisionMaker,
        threadManager,
        scheduler,
        // NOTE(review): the fallback BrowserPageRequester receives the RAW (possibly null)
        // crawlConfiguration while the base call receives SetConfig(crawlConfiguration).
        // Confirm BrowserPageRequester tolerates null, or whether the normalized config
        // from SetConfig was intended here.
        pageRequester ?? new BrowserPageRequester(crawlConfiguration),
        hyperLinkParser,
        memoryManager,
        domainRateLimiter,
        robotsDotTextFinder)
{
    // Default sitemap loader discovers sitemaps via robots.txt.
    SitemapLoader = sitemapLoader ?? new RobotsSitemapLoader();
}
/// <summary>
/// Creates a polite crawler that adds per-domain rate limiting and robots.txt
/// handling on top of the base crawler. Null params fall back to defaults.
/// </summary>
public PoliteWebCrawler(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester httpRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager,
    IDomainRateLimiter domainRateLimiter,
    IRobotsDotTextFinder robotsDotTextFinder)
    : base(crawlConfiguration, crawlDecisionMaker, threadManager, scheduler,
        httpRequester, hyperLinkParser, memoryManager)
{
    // The rate limiter enforces the configured minimum delay between requests to one domain.
    _domainRateLimiter = domainRateLimiter
        ?? new DomainRateLimiter(_crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds);

    // The default robots.txt finder is built with its own fresh page requester.
    _robotsDotTextFinder = robotsDotTextFinder
        ?? new RobotsDotTextFinder(new PageRequester(_crawlContext.CrawlConfiguration));
}
/// <summary>
/// Creates a polite crawler: the base crawler plus per-domain rate limiting
/// and robots.txt support. Passing null uses the default implementations.
/// </summary>
public PoliteWebCrawler(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester pageRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager,
    IDomainRateLimiter domainRateLimiter,
    IRobotsDotTextFinder robotsDotTextFinder)
    : base(crawlConfiguration, crawlDecisionMaker, threadManager, scheduler,
        pageRequester, hyperLinkParser, memoryManager)
{
    // Enforce the configured per-domain minimum crawl delay.
    _domainRateLimiter = domainRateLimiter
        ?? new DomainRateLimiter(_crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds);

    // Default robots.txt finder uses its own page requester built from the crawl config.
    _robotsDotTextFinder = robotsDotTextFinder
        ?? new RobotsDotTextFinder(new PageRequester(_crawlContext.CrawlConfiguration));
}
/// <summary>
/// Creates a polite crawler whose requests are distributed across multiple proxies.
/// </summary>
/// <param name="crawlConfiguration">Configurable crawl values, including the proxy set</param>
/// <param name="pageRequester">Makes the raw http requests; replaced with a
/// MultiProxyPageRequester unless it already is one</param>
public MultiProxyPoliteWebCrawler(
    MultiProxyCrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester pageRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager,
    IMultiProxyDomainRateLimiter domainRateLimiter,
    IRobotsDotTextFinder robotsDotTextFinder)
    : base(crawlConfiguration, crawlDecisionMaker, threadManager, scheduler,
        pageRequester, hyperLinkParser, memoryManager)
{
    // Multi-proxy crawling requires a proxy-aware requester; replace whatever the base
    // constructor assigned unless the caller supplied a MultiProxyPageRequester.
    // 'is' covers both the null and the wrong-type case in one idiomatic check,
    // replacing the original "(x as T) == null" pattern.
    if (!(pageRequester is MultiProxyPageRequester))
        _pageRequester = new MultiProxyPageRequester(crawlConfiguration);

    _domainRateLimiter = domainRateLimiter
        ?? new MultiProxyDomainRateLimiter(_crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds);

    // robots.txt lookups go through the same (possibly proxy-rotating) requester.
    _robotsDotTextFinder = robotsDotTextFinder ?? new RobotsDotTextFinder(_pageRequester);
}
/// <summary>
/// Wires up a crawler from dependency-injected collaborators; unlike the other
/// overloads, no defaults are created here — the container supplies everything.
/// </summary>
public WebCrawler(ILogger<WebCrawler> logger,
    CrawlConfiguration crawlConfiguration,
    IThreadManager threadManager,
    ICrawlDecisionMaker crawlDecisionMaker,
    ICrawlScheduler crawlScheduler,
    IPageRequester pageRequester,
    IDocumentParser hyperLinkParser,
    IRateLimiter rateLimiter)
{
    _logger = logger;
    _crawlConfiguration = crawlConfiguration;
    _crawlContext = new CrawlContext(logger) { CrawlConfiguration = crawlConfiguration };
    _threadManager = threadManager;
    _crawlDecisionMaker = crawlDecisionMaker;
    _scheduler = crawlScheduler;
    _pageRequester = pageRequester;
    _hyperLinkParser = hyperLinkParser;
    _rateLimiter = rateLimiter;

    // Subscribe the crawler's own handler to the page-crawl-starting event.
    PageCrawlStartingAsync += WebCrawler_PageCrawlStartingAsync;
}
/// <summary>
/// Crawler specialized for checking image alt text. All collaborators are
/// forwarded unchanged to the base constructor; no extra setup is needed here.
/// </summary>
public ImageAltTextChecker(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester pageRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager,
    IDomainRateLimiter domainRateLimiter,
    IRobotsDotTextFinder robotsDotTextFinder)
    : base(crawlConfiguration, crawlDecisionMaker, threadManager, scheduler,
        pageRequester, hyperLinkParser, memoryManager, domainRateLimiter, robotsDotTextFinder)
{
}
/// <summary>
/// Web-checking crawler. A pure pass-through constructor: every collaborator
/// goes straight to the base class and no additional state is initialized.
/// </summary>
public WebChecker(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester pageRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager,
    IDomainRateLimiter domainRateLimiter,
    IRobotsDotTextFinder robotsDotTextFinder)
    : base(crawlConfiguration, crawlDecisionMaker, threadManager, scheduler,
        pageRequester, hyperLinkParser, memoryManager, domainRateLimiter, robotsDotTextFinder)
{
}
/// <summary>
/// Set start configuration
/// </summary>
/// <param name="pageRequester">Makes the raw http requests; required</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="pageRequester"/> is null.</exception>
public RobotsDotTextFinder(IPageRequester pageRequester)
{
    if (pageRequester is null)
        throw new ArgumentNullException(nameof(pageRequester));

    PageRequester = pageRequester;
}
/// <summary>
/// Creates a crawler instance with custom settings or implementation.
/// Passing in null for all params is the equivalent of the empty constructor.
/// </summary>
/// <param name="crawlConfiguration">Configurable crawl values</param>
/// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
/// <param name="threadManager">Distributes http requests over multiple threads</param>
/// <param name="scheduler">Decides what link should be crawled next</param>
/// <param name="pageRequester">Makes the raw http requests</param>
/// <param name="hyperLinkParser">Parses a crawled page for its hyperlinks</param>
/// <param name="memoryManager">Checks the memory usage of the host process</param>
public WebCrawler(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester pageRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager)
{
    _crawlContext = new CrawlContext();

    // Resolution order: explicit argument -> config file -> library defaults.
    // The final "?? new CrawlConfiguration()" fallback prevents a NullReferenceException
    // below when no argument is given and GetCrawlConfigurationFromConfigFile() returns
    // null; it matches the other WebCrawler overloads in this file.
    _crawlContext.CrawlConfiguration = crawlConfiguration
        ?? GetCrawlConfigurationFromConfigFile()
        ?? new CrawlConfiguration();
    CrawlBag = _crawlContext.CrawlBag;

    // Fall back to one worker per logical processor when no explicit thread limit is configured.
    _threadManager = threadManager ?? new TaskThreadManager(
        _crawlContext.CrawlConfiguration.MaxConcurrentThreads > 0
            ? _crawlContext.CrawlConfiguration.MaxConcurrentThreads
            : Environment.ProcessorCount);

    _scheduler = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, null, null);
    _pageRequester = pageRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
    _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

    // Memory is only monitored when the configuration imposes a memory constraint.
    if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0
        || _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
    {
        _memoryManager = memoryManager ?? new MemoryManager(
            new CachedMemoryMonitor(new GcMemoryMonitor(),
                _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));
    }

    _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser(
        _crawlContext.CrawlConfiguration.IsRespectMetaRobotsNoFollowEnabled,
        _crawlContext.CrawlConfiguration.IsRespectAnchorRelNoFollowEnabled,
        null,
        _crawlContext.CrawlConfiguration.IsRespectUrlNamedAnchorOrHashbangEnabled);

    _crawlContext.Scheduler = _scheduler;
}
/// <summary>
/// Creates a crawler instance with custom settings or implementations.
/// Passing null for every parameter is equivalent to using the empty constructor.
/// </summary>
/// <param name="crawlConfiguration">Configurable crawl values</param>
/// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
/// <param name="threadManager">Distributes http requests over multiple threads</param>
/// <param name="scheduler">Decides what link should be crawled next</param>
/// <param name="httpRequester">Makes the raw http requests</param>
/// <param name="hyperLinkParser">Parses a crawled page for its hyperlinks</param>
/// <param name="memoryManager">Checks the memory usage of the host process</param>
public WebCrawler(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester httpRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager)
{
    // Configuration resolution order: explicit argument -> app config file -> library defaults.
    var configuration = crawlConfiguration
        ?? GetCrawlConfigurationFromConfigFile()
        ?? new CrawlConfiguration();

    _crawlContext = new CrawlContext { CrawlConfiguration = configuration };
    CrawlBag = _crawlContext.CrawlBag;

    _threadManager = threadManager ?? new ManualThreadManager(configuration.MaxConcurrentThreads);
    _scheduler = scheduler ?? new FifoScheduler(configuration.IsUriRecrawlingEnabled);
    _httpRequester = httpRequester ?? new PageRequester(configuration);
    _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

    // Memory is only monitored when the configuration imposes a memory constraint.
    if (configuration.MaxMemoryUsageInMb > 0 || configuration.MinAvailableMemoryRequiredInMb > 0)
    {
        _memoryManager = memoryManager ?? new MemoryManager(
            new CachedMemoryMonitor(new GcMemoryMonitor(), configuration.MaxMemoryUsageCacheTimeInSeconds));
    }

    _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser();
    _crawlContext.Scheduler = _scheduler;
}
/// <summary>
/// DI-constructed crawler variant: forwards all base collaborators unchanged,
/// then captures its own options and runs Check().
/// </summary>
public ThinkWebCrawler(ILogger<WebCrawler> logger,
    CrawlConfiguration crawlConfiguration,
    IThreadManager threadManager,
    ICrawlDecisionMaker crawlDecisionMaker,
    ICrawlScheduler crawlScheduler,
    IPageRequester pageRequester,
    IDocumentParser hyperLinkParser,
    IRateLimiter rateLimiter,
    IOptions<ThinkCrawlConfiguration> thinkCrawlOpt)
    : base(logger, crawlConfiguration, threadManager, crawlDecisionMaker, crawlScheduler,
        pageRequester, hyperLinkParser, rateLimiter)
{
    // Materialize the strongly-typed options, then run Check()
    // (defined elsewhere in this class; presumably validates the configuration — confirm).
    _thinkCrawlConfiguration = thinkCrawlOpt.Value;
    Check();
}
// Test setup: creates a fresh AwesomiumPageRequester before each test.
// NOTE(review): assumes BuildDir (defined elsewhere in this fixture) points at the
// directory AwesomiumPageRequester expects — confirm against the fixture definition.
public void SetUp()
{
    _requester = new AwesomiumPageRequester(BuildDir);
}