/// <summary> /// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor. /// </summary> /// <param name="threadManager">Distributes http requests over multiple threads</param> /// <param name="scheduler">Decides what link should be crawled next</param> /// <param name="pageRequester">Makes the raw http requests</param> /// <param name="hyperLinkParser">Parses a crawled page for it's hyperlinks</param> /// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param> /// <param name="crawlConfiguration">Configurable crawl values</param> /// <param name="memoryManager">Checks the memory usage of the host process</param> public WebCrawler( CrawlConfiguration crawlConfiguration, ICrawlDecisionMaker crawlDecisionMaker, IThreadManager threadManager, IScheduler scheduler, IPageRequester pageRequester, IHyperLinkParser hyperLinkParser, IMemoryManager memoryManager) { _crawlContext = new CrawlContext(); _crawlContext.CrawlConfiguration = crawlConfiguration ?? GetCrawlConfigurationFromConfigFile(); CrawlBag = _crawlContext.CrawlBag; _threadManager = threadManager ?? new TaskThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads > 0 ? _crawlContext.CrawlConfiguration.MaxConcurrentThreads : Environment.ProcessorCount); _scheduler = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, null, null); _pageRequester = pageRequester ?? new PageRequester(_crawlContext.CrawlConfiguration); _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker(); if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0 || _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0) { _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds)); } _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser(_crawlContext.CrawlConfiguration.IsRespectMetaRobotsNoFollowEnabled, _crawlContext.CrawlConfiguration.IsRespectAnchorRelNoFollowEnabled); _crawlContext.Scheduler = _scheduler; }
/// <summary> /// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor. /// </summary> /// <param name="threadManager">Distributes http requests over multiple threads</param> /// <param name="scheduler">Decides what link should be crawled next</param> /// <param name="httpRequester">Makes the raw http requests</param> /// <param name="hyperLinkParser">Parses a crawled page for it's hyperlinks</param> /// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param> /// <param name="crawlConfiguration">Configurable crawl values</param> public WebCrawler( CrawlConfiguration crawlConfiguration, ICrawlDecisionMaker crawlDecisionMaker, IThreadManager threadManager, IScheduler scheduler, IPageRequester httpRequester, IHyperLinkParser hyperLinkParser, IMemoryManager memoryManager) { _crawlContext = new CrawlContext(); _crawlContext.CrawlConfiguration = crawlConfiguration ?? GetCrawlConfigurationFromConfigFile() ?? new CrawlConfiguration(); CrawlBag = _crawlContext.CrawlBag; _threadManager = threadManager ?? new ManualThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads); _scheduler = scheduler ?? new FifoScheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled); _httpRequester = httpRequester ?? new PageRequester(_crawlContext.CrawlConfiguration); _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker(); if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0 || _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0) { _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds)); } _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser(); _crawlContext.Scheduler = _scheduler; }
public NodeParser(IHyperLinkParser hyperLinkParser, IVertexFactory vertexFactory, IProfileFactory profileFactory, IVertexCache cache, IPause pause) { _hyperLinkParser = hyperLinkParser; _vertexFactory = vertexFactory; _profileFactory = profileFactory; _cache = cache; _pause = pause; }
/// <summary> /// /// </summary> /// <param name="crawlConfiguration"></param> /// <param name="crawlDecisionMaker"></param> /// <param name="threadManager"></param> /// <param name="scheduler"></param> /// <param name="pageRequester"></param> /// <param name="hyperLinkParser"></param> /// <param name="memoryManager"></param> /// <param name="domainRateLimiter"></param> /// <param name="robotsDotTextFinder"></param> public PoliteWebCrawler( CrawlConfiguration crawlConfiguration, ICrawlDecisionMaker crawlDecisionMaker, IThreadManager threadManager, IScheduler scheduler, IPageRequester pageRequester, IHyperLinkParser hyperLinkParser, IMemoryManager memoryManager, IDomainRateLimiter domainRateLimiter, IRobotsDotTextFinder robotsDotTextFinder) : base(crawlConfiguration, crawlDecisionMaker, threadManager, scheduler, pageRequester, hyperLinkParser, memoryManager) { _domainRateLimiter = domainRateLimiter ?? new DomainRateLimiter(_crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds); _robotsDotTextFinder = robotsDotTextFinder ?? new RobotsDotTextFinder(new PageRequester(_crawlContext.CrawlConfiguration)); }
public PoliteWebCrawler( CrawlConfiguration crawlConfiguration, ICrawlDecisionMaker crawlDecisionMaker, IThreadManager threadManager, IScheduler scheduler, IPageRequester pageRequester, IHyperLinkParser hyperLinkParser, IMemoryManager memoryManager, IDomainRateLimiter domainRateLimiter, IRobotsDotTextFinder robotsDotTextFinder) : base(crawlConfiguration, crawlDecisionMaker, threadManager, scheduler, pageRequester, hyperLinkParser, memoryManager) { _domainRateLimiter = domainRateLimiter ?? new DomainRateLimiter(_crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds); _robotsDotTextFinder = robotsDotTextFinder ?? new RobotsDotTextFinder(new PageRequester(_crawlContext.CrawlConfiguration)); }
/// <summary> /// Creates a crawler instance with custom settings or implementation /// </summary> /// <param name="threadManager">Distributes http requests over multiple threads</param> /// <param name="scheduler">Decides what link should be crawled next</param> /// <param name="pageRequester">Makes the raw http requests</param> /// <param name="hyperLinkParser">Parses a crawled page for it's hyperlinks</param> /// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param> /// <param name="crawlConfiguration">Configurable crawl values</param> /// <param name="memoryManager">Checks the memory usage of the host process</param> /// <param name="domainRateLimiter"></param> /// <param name="robotsDotTextFinder"></param> /// <param name="sitemapLoader"></param> public GoogleWebCrawler( CrawlConfiguration crawlConfiguration = null, ICrawlDecisionMaker crawlDecisionMaker = null, IThreadManager threadManager = null, IScheduler scheduler = null, IPageRequester pageRequester = null, IHyperLinkParser hyperLinkParser = null, IMemoryManager memoryManager = null, IDomainRateLimiter domainRateLimiter = null, IRobotsDotTextFinder robotsDotTextFinder = null, IRobotsSitemapLoader sitemapLoader = null) : base(SetConfig(crawlConfiguration), crawlDecisionMaker, threadManager, scheduler, pageRequester ?? new BrowserPageRequester(crawlConfiguration), hyperLinkParser, memoryManager, domainRateLimiter, robotsDotTextFinder) { SitemapLoader = sitemapLoader ?? new RobotsSitemapLoader(); }
public MultiProxyPoliteWebCrawler( MultiProxyCrawlConfiguration crawlConfiguration, ICrawlDecisionMaker crawlDecisionMaker, IThreadManager threadManager, IScheduler scheduler, IPageRequester pageRequester, IHyperLinkParser hyperLinkParser, IMemoryManager memoryManager, IMultiProxyDomainRateLimiter domainRateLimiter, IRobotsDotTextFinder robotsDotTextFinder) : base(crawlConfiguration, crawlDecisionMaker, threadManager, scheduler, pageRequester, hyperLinkParser, memoryManager) { if ((pageRequester as MultiProxyPageRequester) == null) _pageRequester = new MultiProxyPageRequester(crawlConfiguration); _domainRateLimiter = domainRateLimiter ?? new MultiProxyDomainRateLimiter(_crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds); _robotsDotTextFinder = robotsDotTextFinder ?? new RobotsDotTextFinder(_pageRequester); }
public ImageAltTextChecker(CrawlConfiguration crawlConfiguration, ICrawlDecisionMaker crawlDecisionMaker, IThreadManager threadManager, IScheduler scheduler, IPageRequester pageRequester, IHyperLinkParser hyperLinkParser, IMemoryManager memoryManager, IDomainRateLimiter domainRateLimiter, IRobotsDotTextFinder robotsDotTextFinder) : base(crawlConfiguration, crawlDecisionMaker, threadManager, scheduler, pageRequester, hyperLinkParser, memoryManager, domainRateLimiter, robotsDotTextFinder) { }
public WebChecker(CrawlConfiguration crawlConfiguration, ICrawlDecisionMaker crawlDecisionMaker, IThreadManager threadManager, IScheduler scheduler, IPageRequester pageRequester, IHyperLinkParser hyperLinkParser, IMemoryManager memoryManager, IDomainRateLimiter domainRateLimiter, IRobotsDotTextFinder robotsDotTextFinder) : base(crawlConfiguration, crawlDecisionMaker, threadManager, scheduler, pageRequester, hyperLinkParser, memoryManager, domainRateLimiter, robotsDotTextFinder) { }
/// <summary> /// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor. /// </summary> /// <param name="threadManager">Distributes http requests over multiple threads</param> /// <param name="scheduler">Decides what link should be crawled next</param> /// <param name="httpRequester">Makes the raw http requests</param> /// <param name="hyperLinkParser">Parses a crawled page for it's hyperlinks</param> /// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param> /// <param name="crawlConfiguration">Configurable crawl values</param> public WebCrawler( CrawlConfiguration crawlConfiguration, ICrawlDecisionMaker crawlDecisionMaker, IThreadManager threadManager, IScheduler scheduler, IPageRequester httpRequester, IHyperLinkParser hyperLinkParser, IMemoryManager memoryManager) { _crawlContext = new CrawlContext(); _crawlContext.CrawlConfiguration = crawlConfiguration ?? GetCrawlConfigurationFromConfigFile() ?? new CrawlConfiguration(); CrawlBag = _crawlContext.CrawlBag; _threadManager = threadManager ?? new ManualThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads); _scheduler = scheduler ?? new FifoScheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled); _httpRequester = httpRequester ?? new PageRequester(_crawlContext.CrawlConfiguration); _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker(); if(_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0 || _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0) _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds)); _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser(); _crawlContext.Scheduler = _scheduler; }
/// <summary> /// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor. /// </summary> /// <param name="threadManager">Distributes http requests over multiple threads</param> /// <param name="scheduler">Decides what link should be crawled next</param> /// <param name="pageRequester">Makes the raw http requests</param> /// <param name="hyperLinkParser">Parses a crawled page for it's hyperlinks</param> /// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param> /// <param name="crawlConfiguration">Configurable crawl values</param> /// <param name="memoryManager">Checks the memory usage of the host process</param> public WebCrawler( CrawlConfiguration crawlConfiguration, ICrawlDecisionMaker crawlDecisionMaker, IThreadManager threadManager, IScheduler scheduler, IPageRequester pageRequester, IHyperLinkParser hyperLinkParser, IMemoryManager memoryManager) { _crawlContext = new CrawlContext(); _crawlContext.CrawlConfiguration = crawlConfiguration ?? GetCrawlConfigurationFromConfigFile(); CrawlBag = _crawlContext.CrawlBag; _threadManager = threadManager ?? new TaskThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads > 0 ? _crawlContext.CrawlConfiguration.MaxConcurrentThreads : Environment.ProcessorCount); _scheduler = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, null, null); _pageRequester = pageRequester ?? new PageRequester(_crawlContext.CrawlConfiguration); _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker(); if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0 || _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0) _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds)); _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser(_crawlContext.CrawlConfiguration.IsRespectMetaRobotsNoFollowEnabled, _crawlContext.CrawlConfiguration.IsRespectAnchorRelNoFollowEnabled, null, _crawlContext.CrawlConfiguration.IsRespectUrlNamedAnchorOrHashbangEnabled); _crawlContext.Scheduler = _scheduler; }