// Setting up bot config
public void setup_abot()
{
    CrawlConfiguration crawlConfig = new CrawlConfiguration();
    crawlConfig.CrawlTimeoutSeconds = 150;
    crawlConfig.MaxConcurrentThreads = 25;
    crawlConfig.IsExternalPageCrawlingEnabled = false;
    crawlConfig.MaxCrawlDepth = 1;
    crawlConfig.MaxPagesToCrawl = 1000;
    crawlConfig.UserAgentString = "abot v1.0 http://code.google.com/p/abot";

    crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;

    crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
    {
        Regex rx = new Regex(@"\d{5}");
        if (!rx.IsMatch(pageToCrawl.Uri.ToString()) && !pageToCrawl.Uri.ToString().Contains("text="))
            return new CrawlDecision { Allow = false, Reason = "Want only comlinks" };

        return new CrawlDecision { Allow = true, Reason = "OK Link" };
    });
}
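The snippet above subscribes two handlers it does not show. A minimal sketch follows, using Abot's PageCrawlStartingArgs and PageCrawlCompletedArgs event args; the handler bodies are illustrative assumptions, not part of the original.

// Requires: using Abot.Crawler; using Abot.Poco; using System.Net;
void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
{
    // Log each link just before it is requested.
    PageToCrawl pageToCrawl = e.PageToCrawl;
    Console.WriteLine("About to crawl link {0} found on page {1}",
        pageToCrawl.Uri.AbsoluteUri, pageToCrawl.ParentUri.AbsoluteUri);
}

void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    // Treat a WebException or a non-200 status as a failed page.
    CrawledPage crawledPage = e.CrawledPage;
    if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
        Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
    else
        Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);
}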
private static IWebCrawler GetManuallyConfiguredWebCrawler()
{
    // Create a config object manually.
    CrawlConfiguration config = new CrawlConfiguration();
    config.CrawlTimeoutSeconds = 0;
    config.DownloadableContentTypes = "text/html, text/plain";
    config.IsExternalPageCrawlingEnabled = false;
    config.IsExternalPageLinksCrawlingEnabled = false;
    config.IsRespectRobotsDotTextEnabled = false;
    config.IsUriRecrawlingEnabled = false;
    config.MaxConcurrentThreads = 10;
    config.MaxPagesToCrawl = 10;
    config.MaxPagesToCrawlPerDomain = 0;
    config.MinCrawlDelayPerDomainMilliSeconds = 1000;
    config.UserAgentString = "abot v@ABOTASSEMBLYVERSION@ http://code.google.com/p/abot";

    // Add your own values without modifying Abot's source code. These are accessible
    // in the CrawlContext.CrawlConfiguration.ConfigurationExtensions dictionary
    // throughout the crawl.
    config.ConfigurationExtensions.Add("Somekey1", "SomeValue1");
    config.ConfigurationExtensions.Add("Somekey2", "SomeValue2");

    // Initialize the crawler with the custom configuration created above.
    // This overrides the app.config file values.
    return new PoliteWebCrawler(config, null, null, null, null, null, null, null, null);
}
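A hedged sketch of reading those extension values back mid-crawl; the handler name is invented, but the ConfigurationExtensions lookup is the access path the comment above describes.

void crawler_PageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    // ConfigurationExtensions is a plain key/value dictionary carried on the crawl context.
    string someValue = e.CrawlContext.CrawlConfiguration.ConfigurationExtensions["Somekey1"];
    Console.WriteLine("Somekey1 = {0}", someValue);
}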
public void Constructor_ValidUri_CreatesInstance()
{
    CrawlConfiguration unitUnderTest = new CrawlConfiguration();

    Assert.IsNotNull(unitUnderTest.ConfigurationExtensions);
    Assert.AreEqual(0, unitUnderTest.ConfigurationExtensions.Count);
    Assert.AreEqual(0, unitUnderTest.CrawlTimeoutSeconds);
    Assert.AreEqual("text/html", unitUnderTest.DownloadableContentTypes);
    Assert.AreEqual(false, unitUnderTest.IsExternalPageCrawlingEnabled);
    Assert.AreEqual(false, unitUnderTest.IsExternalPageLinksCrawlingEnabled);
    Assert.AreEqual(false, unitUnderTest.IsRespectRobotsDotTextEnabled);
    Assert.AreEqual(false, unitUnderTest.IsUriRecrawlingEnabled);
    Assert.AreEqual(10, unitUnderTest.MaxConcurrentThreads);
    Assert.AreEqual(5, unitUnderTest.MaxRobotsDotTextCrawlDelayInSeconds);
    Assert.AreEqual(1000, unitUnderTest.MaxPagesToCrawl);
    Assert.AreEqual(0, unitUnderTest.MaxPagesToCrawlPerDomain);
    Assert.AreEqual(0, unitUnderTest.MinCrawlDelayPerDomainMilliSeconds);
    Assert.AreEqual("Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; abot v@ABOTASSEMBLYVERSION@ http://code.google.com/p/abot)", unitUnderTest.UserAgentString);
    Assert.AreEqual("abot", unitUnderTest.RobotsDotTextUserAgentString);
    Assert.AreEqual(0, unitUnderTest.MaxPageSizeInBytes);
    Assert.AreEqual(0, unitUnderTest.HttpServicePointConnectionLimit);
    Assert.AreEqual(0, unitUnderTest.HttpRequestTimeoutInSeconds);
    Assert.AreEqual(7, unitUnderTest.HttpRequestMaxAutoRedirects);
    Assert.AreEqual(true, unitUnderTest.IsHttpRequestAutoRedirectsEnabled);
    Assert.AreEqual(false, unitUnderTest.IsHttpRequestAutomaticDecompressionEnabled);
    Assert.AreEqual(0, unitUnderTest.MaxMemoryUsageCacheTimeInSeconds);
    Assert.AreEqual(0, unitUnderTest.MaxMemoryUsageInMb);
    Assert.AreEqual(0, unitUnderTest.MinAvailableMemoryRequiredInMb);
    Assert.AreEqual(100, unitUnderTest.MaxCrawlDepth);
}
public static void CrawlerInit()
{
    _crawledPages = new Dictionary<string, CrawledWebPage>();
    _crawlConfig = new CrawlConfiguration
    {
        CrawlTimeoutSeconds = 100,
        MaxConcurrentThreads = 10,
        MaxPagesToCrawl = 1000,
        UserAgentString = "abot v1.0 http://code.google.com/p/abot",
        DownloadableContentTypes = "text/html, text/plain",
        IsUriRecrawlingEnabled = false,
        IsExternalPageCrawlingEnabled = true,
        IsExternalPageLinksCrawlingEnabled = true,
        HttpServicePointConnectionLimit = 200,
        HttpRequestTimeoutInSeconds = 15,
        HttpRequestMaxAutoRedirects = 7,
        IsHttpRequestAutoRedirectsEnabled = true,
        IsHttpRequestAutomaticDecompressionEnabled = true,
        MinAvailableMemoryRequiredInMb = 0,
        MaxMemoryUsageInMb = 200,
        MaxMemoryUsageCacheTimeInSeconds = 2,
        MaxCrawlDepth = 5, // previously 10
        IsRespectRobotsDotTextEnabled = true
    };
}
public void Crawl(Uri uri, Action<Page> callback)
{
    var crawlConfig = new CrawlConfiguration
    {
        CrawlTimeoutSeconds = 0,
        MaxConcurrentThreads = 5,
        UserAgentString = "InspectionCrawler v1.0",
        MinCrawlDelayPerDomainMilliSeconds = 1000,
        MaxPagesToCrawl = 0,
        MaxPagesToCrawlPerDomain = 0,
        MaxCrawlDepth = int.MaxValue
    };

    var crawler = new PoliteWebCrawler(crawlConfig);
    crawler.PageCrawlCompletedAsync += (sender, args) =>
    {
        var page = args.CrawledPage;
        if (page.WebException != null && page.HttpWebResponse == null)
        {
            _log.Log(new LogMessage(LogType.Error, "Could not get page", page.WebException, page.Uri));
            return;
        }

        callback(Convert(args.CrawledPage));
    };

    crawler.Crawl(uri);
}
static void Main(string[] args)
{
    CrawlConfiguration config = new CrawlConfiguration();
    config.MaxConcurrentThreads = 1; // Web Extractor is not currently thread-safe.

    // Create the PhantomJS instance. This will spawn a new PhantomJS process using phantomjs.exe.
    // Make sure to dispose this instance or you will have a zombie process!
    IWebDriver driver = CreatePhantomJsDriver(config);

    // Create the content extractor that uses PhantomJS.
    IWebContentExtractor extractor = new JavaScriptContentExtractor(driver);

    // Create a PageRequester that will use the extractor.
    IPageRequester requester = new PageRequester(config, extractor);

    using (IWebCrawler crawler = new PoliteWebCrawler(config, null, null, null, requester, null, null, null, null))
    {
        crawler.PageCrawlCompleted += OnPageCrawlCompleted;

        CrawlResult result = crawler.Crawl(new Uri("http://wvtesting2.com/"));
        if (result.ErrorOccurred)
            Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
        else
            Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
    }

    Console.Read();
}
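Since disposing the crawler is not guaranteed to stop the spawned phantomjs.exe, a hedged variant of the flow above shuts the driver down explicitly; the RunCrawl method name is invented for illustration.

static void RunCrawl(CrawlConfiguration config)
{
    IWebDriver driver = CreatePhantomJsDriver(config);
    try
    {
        IWebContentExtractor extractor = new JavaScriptContentExtractor(driver);
        IPageRequester requester = new PageRequester(config, extractor);
        using (IWebCrawler crawler = new PoliteWebCrawler(config, null, null, null, requester, null, null, null, null))
        {
            crawler.Crawl(new Uri("http://wvtesting2.com/"));
        }
    }
    finally
    {
        driver.Quit(); // shuts down phantomjs.exe so no zombie process is left behind
    }
}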
static void Main(string[] args)
{
    var proxies = new WebProxy[]
    {
        new WebProxy("http://192.168.1.1:3128"),
        new WebProxy("http://192.168.1.2:3128")
    };

    var crawlConfig = new CrawlConfiguration();
    var domainLimiter = new MultiProxyDomainRateLimiter(crawlConfig.MinCrawlDelayPerDomainMilliSeconds);
    var pageRequester = new MultiProxyPageRequester(crawlConfig, new WebContentExtractor(), proxies);
    var crawler = new MultiProxyPoliteWebCrawler(crawlConfig, null, null, null, pageRequester, null, null, domainLimiter, null);

    crawler.Crawl(new Uri("http://localhost/"));
}
public PageRequester(CrawlConfiguration config) { if (config == null) throw new ArgumentNullException("config"); _userAgentString = config.UserAgentString.Replace("@ABOTASSEMBLYVERSION@", Assembly.GetAssembly(this.GetType()).GetName().Version.ToString()); _config = config; if (_config.HttpServicePointConnectionLimit > 0) ServicePointManager.DefaultConnectionLimit = _config.HttpServicePointConnectionLimit; }
public PageRequester(CrawlConfiguration config, IWebContentExtractor contentExtractor) { if (config == null) throw new ArgumentNullException("config"); _config = config; if (_config.HttpServicePointConnectionLimit > 0) ServicePointManager.DefaultConnectionLimit = _config.HttpServicePointConnectionLimit; _extractor = contentExtractor ?? new WebContentExtractor(); }
public CrawlConfiguration Convert()
{
    AutoMapper.Mapper.CreateMap<CrawlBehaviorElement, CrawlConfiguration>();
    AutoMapper.Mapper.CreateMap<PolitenessElement, CrawlConfiguration>();

    CrawlConfiguration config = new CrawlConfiguration();
    AutoMapper.Mapper.Map<CrawlBehaviorElement, CrawlConfiguration>(CrawlBehavior, config);
    AutoMapper.Mapper.Map<PolitenessElement, CrawlConfiguration>(Politeness, config);

    foreach (ExtensionValueElement element in ExtensionValues)
        config.ConfigurationExtensions.Add(element.Key, element.Value);

    return config;
}
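A hedged usage sketch: Convert() is typically invoked on the section handler loaded from app.config. The "abot" section name and the cast below are assumptions about how the handler is registered.

// Requires: using System.Configuration;
AbotConfigurationSectionHandler section =
    (AbotConfigurationSectionHandler)ConfigurationManager.GetSection("abot");
CrawlConfiguration config = section.Convert();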
public PoliteWebCrawler(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester pageRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager,
    IDomainRateLimiter domainRateLimiter,
    IRobotsDotTextFinder robotsDotTextFinder)
    : base(crawlConfiguration, crawlDecisionMaker, threadManager, scheduler, pageRequester, hyperLinkParser, memoryManager)
{
    _domainRateLimiter = domainRateLimiter ?? new DomainRateLimiter(_crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds);
    _robotsDotTextFinder = robotsDotTextFinder ?? new RobotsDotTextFinder(new PageRequester(_crawlContext.CrawlConfiguration));
}
public void Crawl_CrawlTimeoutIs2Secs_TimesOut()
{
    CrawlConfiguration configuration = new CrawlConfiguration();
    configuration.CrawlTimeoutSeconds = 2;

    int pagesCrawledCount = 0;

    PoliteWebCrawler crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);
    crawler.PageCrawlCompletedAsync += (a, b) => pagesCrawledCount++;

    CrawlResult result = crawler.Crawl(new Uri("http://localhost.fiddler:1111/"));

    Assert.IsFalse(result.ErrorOccurred);
    Assert.IsTrue(result.Elapsed.TotalSeconds < 8, "Took 8 or more seconds");
    Assert.IsTrue(pagesCrawledCount < 2, "Crawled 2 or more pages");
}
public void Crawl_CrawlTimeoutIs1Sec_TimesOut()
{
    CrawlConfiguration configuration = new CrawlConfiguration();
    configuration.CrawlTimeoutSeconds = 1;

    int pagesCrawledCount = 0;

    PoliteWebCrawler crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);
    crawler.PageCrawlCompletedAsync += (a, b) => pagesCrawledCount++;

    CrawlResult result = crawler.Crawl(new Uri("http://wvtesting2.com/"));

    Assert.IsFalse(result.ErrorOccurred);
    Assert.IsTrue(result.Elapsed.TotalSeconds < 5);
    Assert.IsTrue(pagesCrawledCount > 0);
}
public void Crawl_MaxPagesTo25_OnlyCrawls25Pages()
{
    new PageRequester(new CrawlConfiguration { UserAgentString = "aaa" }).MakeRequest(new Uri("http://localhost.fiddler:1111/PageGenerator/ClearCounters"));

    CrawlConfiguration configuration = new CrawlConfiguration();
    configuration.MaxPagesToCrawl = 25;

    int pagesCrawledCount = 0;

    PoliteWebCrawler crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);
    crawler.PageCrawlCompletedAsync += (a, b) => pagesCrawledCount++;

    crawler.Crawl(new Uri("http://localhost.fiddler:1111/"));

    Assert.AreEqual(25, pagesCrawledCount);
}
public PageRequester(CrawlConfiguration config, IWebContentExtractor contentExtractor) { if (config == null) throw new ArgumentNullException("config"); _config = config; if (_config.HttpServicePointConnectionLimit > 0) ServicePointManager.DefaultConnectionLimit = _config.HttpServicePointConnectionLimit; if (!_config.IsSslCertificateValidationEnabled) ServicePointManager.ServerCertificateValidationCallback += (sender, certificate, chain, sslPolicyErrors) => true; _extractor = contentExtractor ?? new WebContentExtractor(); }
public void SetUp()
{
    _fakeHyperLinkParser = new Mock<IHyperLinkParser>();
    _fakeHttpRequester = new Mock<IPageRequester>();
    _fakeCrawlDecisionMaker = new Mock<ICrawlDecisionMaker>();
    _fakeDomainRateLimiter = new Mock<IDomainRateLimiter>();
    _fakeMemoryManager = new Mock<IMemoryManager>();
    _fakeRobotsDotTextFinder = new Mock<IRobotsDotTextFinder>();
    _fakeRobotsDotText = new Mock<IRobotsDotText>();

    _dummyScheduler = new Scheduler();
    _dummyThreadManager = new ManualThreadManager(1);
    _dummyConfiguration = new CrawlConfiguration();
    _dummyConfiguration.ConfigurationExtensions.Add("somekey", "someval");

    _rootUri = new Uri("http://a.com/");
}
public void Crawl_MaxPagesTo5_WithCrawlDelay_OnlyCrawls5Pages()
{
    new PageRequester(new CrawlConfiguration { UserAgentString = "aaa" }).MakeRequest(new Uri("http://localhost.fiddler:1111/PageGenerator/ClearCounters"));

    CrawlConfiguration configuration = new CrawlConfiguration();
    configuration.MinCrawlDelayPerDomainMilliSeconds = 1000; // A crawl delay increases the chance of Abot crawling more pages than MaxPagesToCrawl, which is the issue this test guards against.
    configuration.MaxPagesToCrawl = 5;

    int pagesCrawledCount = 0;

    PoliteWebCrawler crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);
    crawler.PageCrawlCompletedAsync += (a, b) => pagesCrawledCount++;

    crawler.Crawl(new Uri("http://localhost.fiddler:1111/"));

    Assert.AreEqual(5, pagesCrawledCount);
}
private static IWebCrawler GetManuallyConfiguredWebCrawler()
{
    var crawlConfiguration = new CrawlConfiguration();
    crawlConfiguration.CrawlTimeoutSeconds = 0;
    crawlConfiguration.DownloadableContentTypes = "text/html, text/plain";
    crawlConfiguration.IsExternalPageCrawlingEnabled = false;
    crawlConfiguration.IsExternalPageLinksCrawlingEnabled = false;
    crawlConfiguration.IsRespectRobotsDotTextEnabled = false;
    crawlConfiguration.IsUriRecrawlingEnabled = false;
    crawlConfiguration.MaxConcurrentThreads = 10;
    crawlConfiguration.MaxPagesToCrawl = 10;
    crawlConfiguration.MaxPagesToCrawlPerDomain = 0;
    crawlConfiguration.MinCrawlDelayPerDomainMilliSeconds = 1000;
    crawlConfiguration.ConfigurationExtensions.Add("Somekey1", "SomeValue1");
    crawlConfiguration.ConfigurationExtensions.Add("Somekey2", "SomeValue2");

    return new PoliteWebCrawler(crawlConfiguration, null, null, null, null, null, null, null, null);
}
public void SetUp()
{
    _fakeHyperLinkParser = new Mock<IHyperLinkParser>();
    _fakeHttpRequester = new Mock<IPageRequester>();
    _fakeCrawlDecisionMaker = new Mock<ICrawlDecisionMaker>();
    _fakeMemoryManager = new Mock<IMemoryManager>();
    _fakeDomainRateLimiter = new Mock<IDomainRateLimiter>();
    _fakeRobotsDotTextFinder = new Mock<IRobotsDotTextFinder>();

    _dummyScheduler = new Scheduler();
    _dummyThreadManager = new TaskThreadManager(10);
    _dummyConfiguration = new CrawlConfiguration();
    _dummyConfiguration.ConfigurationExtensions.Add("somekey", "someval");

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);
    _unitUnderTest.CrawlBag.SomeVal = "SomeVal";
    _unitUnderTest.CrawlBag.SomeList = new List<string>() { "a", "b" };

    _rootUri = new Uri("http://a.com/");
}
private static IWebDriver CreatePhantomJsDriver(CrawlConfiguration p_Config)
{
    // Optional options passed to the PhantomJS process.
    PhantomJSOptions options = new PhantomJSOptions();
    options.AddAdditionalCapability("phantomjs.page.settings.userAgent", p_Config.UserAgentString);
    options.AddAdditionalCapability("phantomjs.page.settings.javascriptCanCloseWindows", false);
    options.AddAdditionalCapability("phantomjs.page.settings.javascriptCanOpenWindows", false);
    options.AddAdditionalCapability("acceptSslCerts", !p_Config.IsSslCertificateValidationEnabled);

    // Basic auth credentials.
    options.AddAdditionalCapability("phantomjs.page.settings.userName", p_Config.LoginUser);
    options.AddAdditionalCapability("phantomjs.page.settings.password", p_Config.LoginPassword);

    // Create the service while hiding the prompt window.
    PhantomJSDriverService service = PhantomJSDriverService.CreateDefaultService();
    service.HideCommandPromptWindow = true;

    IWebDriver driver = new PhantomJSDriver(service, options);
    return driver;
}
public void Constructor_ValidUri_CreatesInstance()
{
    CrawlConfiguration unitUnderTest = new CrawlConfiguration();

    Assert.IsNotNull(unitUnderTest.ConfigurationExtensions);
    Assert.AreEqual(0, unitUnderTest.ConfigurationExtensions.Count);
    Assert.AreEqual(0, unitUnderTest.CrawlTimeoutSeconds);
    Assert.AreEqual("text/html", unitUnderTest.DownloadableContentTypes);
    Assert.AreEqual(false, unitUnderTest.IsExternalPageCrawlingEnabled);
    Assert.AreEqual(false, unitUnderTest.IsExternalPageLinksCrawlingEnabled);
    Assert.AreEqual(false, unitUnderTest.IsRespectRobotsDotTextEnabled);
    Assert.AreEqual(false, unitUnderTest.IsRespectMetaRobotsNoFollowEnabled);
    Assert.AreEqual(false, unitUnderTest.IsRespectAnchorRelNoFollowEnabled);
    Assert.AreEqual(false, unitUnderTest.IsUriRecrawlingEnabled);
    Assert.AreEqual(10, unitUnderTest.MaxConcurrentThreads);
    Assert.AreEqual(5, unitUnderTest.MaxRobotsDotTextCrawlDelayInSeconds);
    Assert.AreEqual(1000, unitUnderTest.MaxPagesToCrawl);
    Assert.AreEqual(0, unitUnderTest.MaxPagesToCrawlPerDomain);
    Assert.AreEqual(0, unitUnderTest.MinCrawlDelayPerDomainMilliSeconds);
    Assert.AreEqual("Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko", unitUnderTest.UserAgentString);
    Assert.AreEqual("abot", unitUnderTest.RobotsDotTextUserAgentString);
    Assert.AreEqual(0, unitUnderTest.MaxPageSizeInBytes);
    Assert.AreEqual(200, unitUnderTest.HttpServicePointConnectionLimit);
    Assert.AreEqual(15, unitUnderTest.HttpRequestTimeoutInSeconds);
    Assert.AreEqual(7, unitUnderTest.HttpRequestMaxAutoRedirects);
    Assert.AreEqual(true, unitUnderTest.IsHttpRequestAutoRedirectsEnabled);
    Assert.AreEqual(false, unitUnderTest.IsHttpRequestAutomaticDecompressionEnabled);
    Assert.AreEqual(false, unitUnderTest.IsSendingCookiesEnabled);
    Assert.AreEqual(true, unitUnderTest.IsSslCertificateValidationEnabled);
    Assert.AreEqual(false, unitUnderTest.IsRespectUrlNamedAnchorOrHashbangEnabled);
    Assert.AreEqual(0, unitUnderTest.MaxMemoryUsageCacheTimeInSeconds);
    Assert.AreEqual(0, unitUnderTest.MaxMemoryUsageInMb);
    Assert.AreEqual(0, unitUnderTest.MinAvailableMemoryRequiredInMb);
    Assert.AreEqual(100, unitUnderTest.MaxCrawlDepth);
    Assert.AreEqual(false, unitUnderTest.IsForcedLinkParsingEnabled);
    Assert.AreEqual(0, unitUnderTest.MaxRetryCount);
    Assert.AreEqual(0, unitUnderTest.MinRetryDelayInMilliseconds);
    Assert.AreEqual(null, unitUnderTest.LoginUser);
    Assert.AreEqual(null, unitUnderTest.LoginPassword);
    Assert.AreEqual(false, unitUnderTest.IsAlwaysLogin);
}
public void Crawl_IsRateLimited()
{
    new PageRequester(new CrawlConfiguration { UserAgentString = "aaa" }).MakeRequest(new Uri("http://localhost.fiddler:1111/PageGenerator/ClearCounters"));

    CrawlConfiguration configuration = new CrawlConfiguration();
    configuration.MaxPagesToCrawl = 3;
    configuration.MinCrawlDelayPerDomainMilliSeconds = 1000; // 1-second delay before each of the 2 pages after the first = 2 (or more) seconds total

    int pagesCrawledCount = 0;

    var crawler = new PoliteWebCrawler(configuration);
    crawler.PageCrawlCompletedAsync += (a, b) => pagesCrawledCount++;

    var uriToCrawl = new Uri("http://localhost.fiddler:1111/");
    var start = DateTime.Now;
    crawler.Crawl(uriToCrawl);
    var elapsed = DateTime.Now - start;

    Assert.GreaterOrEqual(elapsed.TotalMilliseconds, 2000);
    Assert.AreEqual(3, pagesCrawledCount);
}
public CSQueryHyperlinkParser(CrawlConfiguration config, Func<string, string> cleanURLFunc)
    : base(config, cleanURLFunc)
{
}
/// <summary>
/// Creates a crawler instance with custom settings or implementations. Passing in null for all params is the equivalent of the empty constructor.
/// </summary>
/// <param name="threadManager">Distributes http requests over multiple threads</param>
/// <param name="scheduler">Decides what link should be crawled next</param>
/// <param name="httpRequester">Makes the raw http requests</param>
/// <param name="hyperLinkParser">Parses a crawled page for its hyperlinks</param>
/// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
/// <param name="crawlConfiguration">Configurable crawl values</param>
public WebCrawler(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester httpRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager)
{
    _crawlContext = new CrawlContext();
    _crawlContext.CrawlConfiguration = crawlConfiguration ?? GetCrawlConfigurationFromConfigFile() ?? new CrawlConfiguration();
    CrawlBag = _crawlContext.CrawlBag;

    _threadManager = threadManager ?? new ManualThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads);
    _scheduler = scheduler ?? new FifoScheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled);
    _httpRequester = httpRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
    _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

    if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0 || _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
        _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));

    _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser();

    _crawlContext.Scheduler = _scheduler;
}
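As the doc comment notes, passing null for every dependency falls back to the defaults via the ?? chains above; a hedged one-line illustration:

// Equivalent to the empty constructor: each null is replaced by the default component.
IWebCrawler allDefaults = new WebCrawler(null, null, null, null, null, null, null);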
public PoliteWebCrawler(CrawlConfiguration crawlConfiguration)
    : this(crawlConfiguration, null, null, null, null, null, null, null, null)
{
}
public PageRequesterWrapper(CrawlConfiguration config) : base(config) { }
public void ShouldCrawlPage_OverMaxPagesToCrawlPerDomain_IsRetry_ReturnsTrue()
{
    Uri uri = new Uri("http://a.com/");
    CrawlConfiguration config = new CrawlConfiguration
    {
        MaxPagesToCrawlPerDomain = 100
    };
    ConcurrentDictionary<string, int> countByDomain = new ConcurrentDictionary<string, int>();
    countByDomain.TryAdd(uri.Authority, 100);
    CrawlContext crawlContext = new CrawlContext
    {
        CrawlConfiguration = config,
        CrawlStartDate = DateTime.Now,
        CrawlCountByDomain = countByDomain
    };

    CrawlDecision result = _unitUnderTest.ShouldCrawlPage(
        new PageToCrawl(new Uri(uri.AbsoluteUri + "anotherpage"))
        {
            IsRetry = true,
            IsInternal = true
        },
        crawlContext);

    Assert.IsTrue(result.Allow);
    Assert.IsFalse(result.ShouldHardStopCrawl);
    Assert.IsFalse(result.ShouldStopCrawl);
}
public void Dispose()
{
    if (_extractor != null)
    {
        _extractor.Dispose();
    }

    _cookieContainer = null;
    _config = null;
}
protected virtual void PrintConfigValues(CrawlConfiguration config)
{
    _logger.Info("Configuration Values:");

    string indentString = new string(' ', 2);
    foreach (PropertyInfo property in config.GetType().GetProperties())
    {
        if (property.Name != "ConfigurationExtensions")
            _logger.InfoFormat("{0}{1}: {2}", indentString, property.Name, property.GetValue(config, null));
    }

    foreach (string key in config.ConfigurationExtensions.Keys)
    {
        _logger.InfoFormat("{0}{1}: {2}", indentString, key, config.ConfigurationExtensions[key]);
    }
}
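For reference, the output format implied by the InfoFormat calls above would look roughly like the following (illustrative values, not captured from a real run):

// Configuration Values:
//   MaxConcurrentThreads: 10
//   MaxPagesToCrawl: 1000
//   UserAgentString: abot v1.0 http://code.google.com/p/abot
//   Somekey1: SomeValue1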
public PageRequester(CrawlConfiguration config) : this(config, null) { }