/// <summary>
/// Builds a <see cref="CrawlConfiguration"/> from the CrawlBehavior, Politeness and
/// Authorization elements, then copies every entry of ExtensionValues into the
/// result's ConfigurationExtensions.
/// </summary>
/// <returns>A newly created, populated <see cref="CrawlConfiguration"/></returns>
public CrawlConfiguration Convert()
{
    // Register one mapping per element type before any Map call uses it.
    AutoMapper.Mapper.CreateMap<CrawlBehaviorElement, CrawlConfiguration>();
    AutoMapper.Mapper.CreateMap<PolitenessElement, CrawlConfiguration>();
    AutoMapper.Mapper.CreateMap<AuthorizationElement, CrawlConfiguration>();

    CrawlConfiguration result = new CrawlConfiguration();

    // The same instance is passed as the destination of all three Map calls.
    AutoMapper.Mapper.Map<CrawlBehaviorElement, CrawlConfiguration>(CrawlBehavior, result);
    AutoMapper.Mapper.Map<PolitenessElement, CrawlConfiguration>(Politeness, result);
    AutoMapper.Mapper.Map<AuthorizationElement, CrawlConfiguration>(Authorization, result);

    foreach (ExtensionValueElement extension in ExtensionValues)
    {
        result.ConfigurationExtensions.Add(extension.Key, extension.Value);
    }

    return result;
}
/// <summary>
/// Creates an ImageAltTextChecker from explicitly supplied collaborators.
/// Every argument is forwarded unchanged, in the same order, to the base constructor.
/// </summary>
public ImageAltTextChecker(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester pageRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager,
    IDomainRateLimiter domainRateLimiter,
    IRobotsDotTextFinder robotsDotTextFinder)
    : base(
        crawlConfiguration,
        crawlDecisionMaker,
        threadManager,
        scheduler,
        pageRequester,
        hyperLinkParser,
        memoryManager,
        domainRateLimiter,
        robotsDotTextFinder)
{
}
/// <summary>
/// Creates an ImageAltTextChecker from a crawl configuration only.
/// The eight remaining base-constructor arguments are passed as null.
/// </summary>
/// <param name="crawlConfiguration">Configuration forwarded to the base constructor</param>
public ImageAltTextChecker(CrawlConfiguration crawlConfiguration)
    : base(
        crawlConfiguration,
        null,
        null,
        null,
        null,
        null,
        null,
        null,
        null)
{
    // NOTE(review): presumably the base class substitutes defaults for the null arguments - confirm.
}
/// <summary>
/// Creates a CSQueryHyperlinkParser, forwarding both arguments unchanged to the base constructor.
/// </summary>
/// <param name="config">Crawl configuration handed to the base class</param>
/// <param name="cleanURLFunc">String-to-string function handed to the base class (judging by the name, used to clean URLs - confirm against the base class)</param>
public CSQueryHyperlinkParser(
    CrawlConfiguration config,
    Func<string, string> cleanURLFunc)
    : base(config, cleanURLFunc)
{
}
/// <summary>
/// Creates an ImageSrcChecker from a crawl configuration only.
/// The eight remaining base-constructor arguments are passed as null.
/// </summary>
/// <param name="crawlConfiguration">Configuration forwarded to the base constructor</param>
public ImageSrcChecker(CrawlConfiguration crawlConfiguration)
    : base(
        crawlConfiguration,
        null,
        null,
        null,
        null,
        null,
        null,
        null,
        null)
{
    // NOTE(review): presumably the base class substitutes defaults for the null arguments - confirm.
}
/// <summary>
/// Looks up a value by key in the ConfigurationExtensions of the given crawl configuration.
/// </summary>
/// <param name="config">The crawl configuration whose ConfigurationExtensions are searched</param>
/// <param name="propertyName">The property's key</param>
/// <param name="propertyValue">Receives the value stored under the key, or an empty string when the key is not present</param>
/// <returns>True when the key was found, otherwise false</returns>
public static bool GetConfigurationValue(CrawlConfiguration config, string propertyName, out string propertyValue)
{
    if (config.ConfigurationExtensions.TryGetValue(propertyName, out propertyValue))
    {
        return true;
    }

    // TryGetValue resets the out argument to null on a miss, so the empty-string
    // default has to be applied after the lookup, not before it.
    propertyValue = string.Empty;
    return false;
}
/// <summary>
/// Creates a HapHyperLinkParser, forwarding both arguments unchanged to the base constructor.
/// </summary>
/// <param name="config">Crawl configuration handed to the base class</param>
/// <param name="cleanURLFunc">String-to-string function handed to the base class (judging by the name, used to clean URLs - confirm against the base class)</param>
public HapHyperLinkParser(
    CrawlConfiguration config,
    Func<string, string> cleanURLFunc)
    : base(config, cleanURLFunc)
{
}
/// <summary>
/// Writes the given configuration to the logger at Info level: a header line, the
/// version of the assembly that contains this instance's runtime type, every public
/// property of the configuration except ConfigurationExtensions, and finally each
/// ConfigurationExtensions entry as its own line.
/// </summary>
/// <param name="config">The configuration whose values are logged</param>
protected virtual void PrintConfigValues(CrawlConfiguration config)
{
    const string indent = "  ";

    _logger.Info("Configuration Values:");

    Assembly owningAssembly = Assembly.GetAssembly(this.GetType());
    string abotVersion = owningAssembly.GetName().Version.ToString();
    _logger.InfoFormat("{0}Abot Version: {1}", indent, abotVersion);

    foreach (PropertyInfo property in config.GetType().GetProperties())
    {
        // The extensions collection is listed entry by entry below instead.
        if (property.Name == "ConfigurationExtensions")
        {
            continue;
        }

        _logger.InfoFormat("{0}{1}: {2}", indent, property.Name, property.GetValue(config, null));
    }

    foreach (var extension in config.ConfigurationExtensions)
    {
        _logger.InfoFormat("{0}{1}: {2}", indent, extension.Key, extension.Value);
    }
}
/// <summary>
/// Creates a crawler instance with custom settings or implementations. Each argument
/// passed as null is replaced by a default constructed here. The memory manager is
/// the exception: it is only assigned when the configuration sets MaxMemoryUsageInMb
/// or MinAvailableMemoryRequiredInMb above zero.
/// </summary>
/// <param name="crawlConfiguration">Configurable crawl values</param>
/// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
/// <param name="threadManager">Distributes http requests over multiple threads</param>
/// <param name="scheduler">Decides what link should be crawled next</param>
/// <param name="pageRequester">Makes the raw http requests</param>
/// <param name="hyperLinkParser">Parses a crawled page for its hyperlinks</param>
/// <param name="memoryManager">Checks the memory usage of the host process</param>
public WebCrawler(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester pageRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager)
{
    _crawlContext = new CrawlContext
    {
        CrawlConfiguration = crawlConfiguration ?? new CrawlConfiguration()
    };
    CrawlBag = _crawlContext.CrawlBag;

    // Short alias for the effective configuration used by the defaults below.
    CrawlConfiguration config = _crawlContext.CrawlConfiguration;

    if (threadManager != null)
    {
        _threadManager = threadManager;
    }
    else
    {
        // Use the processor count when no positive thread limit is configured.
        int threadCount = config.MaxConcurrentThreads > 0
            ? config.MaxConcurrentThreads
            : Environment.ProcessorCount;
        _threadManager = new TaskThreadManager(threadCount);
    }

    _scheduler = scheduler ?? new Scheduler(config.IsUriRecrawlingEnabled, null, null);
    _pageRequester = pageRequester ?? new PageRequester(config);
    _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

    // A supplied memoryManager is ignored unless one of the memory settings is positive.
    bool memoryLimitsConfigured = config.MaxMemoryUsageInMb > 0
        || config.MinAvailableMemoryRequiredInMb > 0;
    if (memoryLimitsConfigured)
    {
        _memoryManager = memoryManager
            ?? new MemoryManager(
                new CachedMemoryMonitor(new GcMemoryMonitor(), config.MaxMemoryUsageCacheTimeInSeconds));
    }

    _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser(config, null);

    _crawlContext.Scheduler = _scheduler;
}
/// <summary>
/// Creates an AriaLabelChecker from a crawl configuration only.
/// The eight remaining base-constructor arguments are passed as null.
/// </summary>
/// <param name="crawlConfiguration">Configuration forwarded to the base constructor</param>
public AriaLabelChecker(CrawlConfiguration crawlConfiguration)
    : base(
        crawlConfiguration,
        null,
        null,
        null,
        null,
        null,
        null,
        null,
        null)
{
    // NOTE(review): presumably the base class substitutes defaults for the null arguments - confirm.
}