/// <summary>
/// Creates a page requester from the given configuration. Substitutes the
/// @ABOTASSEMBLYVERSION@ token in the configured user agent string with this
/// assembly's version, and applies the http service point connection limit
/// when one is configured (value greater than zero).
/// </summary>
/// <param name="config">Configurable crawl values; must not be null</param>
/// <exception cref="ArgumentNullException">Thrown when config is null</exception>
public PageRequester(CrawlConfiguration config)
{
    if (config == null)
        throw new ArgumentNullException("config");

    _config = config;

    // Guard against a null UserAgentString instead of failing with a
    // NullReferenceException; a string without the token passes through
    // Replace() unchanged.
    if (_config.UserAgentString != null)
        _userAgentString = _config.UserAgentString.Replace("@ABOTASSEMBLYVERSION@", this.GetType().Assembly.GetName().Version.ToString());

    if (_config.HttpServicePointConnectionLimit > 0)
        ServicePointManager.DefaultConnectionLimit = _config.HttpServicePointConnectionLimit;
}
/// <summary>
/// Builds a CrawlConfiguration by mapping the crawl behavior and politeness
/// config sections onto a fresh instance, then copying each extension
/// key/value pair into its ConfigurationExtensions dictionary.
/// </summary>
public CrawlConfiguration Convert()
{
    AutoMapper.Mapper.CreateMap<CrawlBehaviorElement, CrawlConfiguration>();
    AutoMapper.Mapper.CreateMap<PolitenessElement, CrawlConfiguration>();

    CrawlConfiguration mappedConfig = new CrawlConfiguration();
    AutoMapper.Mapper.Map<CrawlBehaviorElement, CrawlConfiguration>(CrawlBehavior, mappedConfig);
    AutoMapper.Mapper.Map<PolitenessElement, CrawlConfiguration>(Politeness, mappedConfig);

    // Extension values have no mapped destination properties; copy them
    // one entry at a time into the extensions dictionary.
    foreach (ExtensionValueElement element in ExtensionValues)
        mappedConfig.ConfigurationExtensions.Add(element.Key, element.Value);

    return mappedConfig;
}
/// <summary>
/// Creates a polite crawler with custom settings or implementations; any
/// null argument falls back to the corresponding default implementation.
/// </summary>
public PoliteWebCrawler(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester httpRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager,
    IDomainRateLimiter domainRateLimiter,
    IRobotsDotTextFinder robotsDotTextFinder)
    : base(crawlConfiguration, crawlDecisionMaker, threadManager, scheduler, httpRequester, hyperLinkParser, memoryManager)
{
    // Default rate limiter honors the configured minimum per-domain delay.
    if (domainRateLimiter == null)
        _domainRateLimiter = new DomainRateLimiter(_crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds);
    else
        _domainRateLimiter = domainRateLimiter;

    // Default robots.txt finder requests the file with its own page requester.
    if (robotsDotTextFinder == null)
        _robotsDotTextFinder = new RobotsDotTextFinder(new PageRequester(_crawlContext.CrawlConfiguration));
    else
        _robotsDotTextFinder = robotsDotTextFinder;
}
/// <summary>
/// Logs a header line followed by every public configuration property value
/// and then each configuration extension key/value pair, all indented two
/// spaces beneath the header.
/// </summary>
protected virtual void PrintConfigValues(CrawlConfiguration config)
{
    _logger.Info("Configuration Values:");

    string indent = "  ";
    foreach (PropertyInfo property in config.GetType().GetProperties())
    {
        // The extensions dictionary is logged entry-by-entry below rather
        // than as a single property value.
        if (property.Name == "ConfigurationExtensions")
            continue;

        _logger.InfoFormat("{0}{1}: {2}", indent, property.Name, property.GetValue(config, null));
    }

    foreach (string key in config.ConfigurationExtensions.Keys)
        _logger.InfoFormat("{0}{1}: {2}", indent, key, config.ConfigurationExtensions[key]);
}
/// <summary>
/// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor.
/// </summary>
/// <param name="crawlConfiguration">Configurable crawl values; falls back to the config file section, then to defaults</param>
/// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
/// <param name="threadManager">Distributes http requests over multiple threads</param>
/// <param name="scheduler">Decides what link should be crawled next</param>
/// <param name="httpRequester">Makes the raw http requests</param>
/// <param name="hyperLinkParser">Parses a crawled page for its hyperlinks</param>
/// <param name="memoryManager">Tracks memory usage; only consulted when a memory limit is configured</param>
public WebCrawler(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester httpRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager)
{
    _crawlContext = new CrawlContext();
    // Configuration priority: explicit argument, then the app/web.config
    // section, then hard-coded defaults.
    _crawlContext.CrawlConfiguration = crawlConfiguration ?? GetCrawlConfigurationFromConfigFile() ?? new CrawlConfiguration();
    CrawlBag = _crawlContext.CrawlBag;

    // Each dependency falls back to its default implementation when null.
    _threadManager = threadManager ?? new ManualThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads);
    _scheduler = scheduler ?? new FifoScheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled);
    _httpRequester = httpRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
    _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

    // NOTE(review): when neither memory threshold is configured, a
    // caller-supplied memoryManager is silently ignored and _memoryManager
    // stays unassigned — confirm this is intentional.
    if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0 || _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
        _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));

    _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser();

    _crawlContext.Scheduler = _scheduler;
}