Example No. 1
        // Setting up bot config
        public void setup_abot()
        {
            CrawlConfiguration crawlConfig = new CrawlConfiguration();

            crawlConfig.CrawlTimeoutSeconds = 150;
            crawlConfig.MaxConcurrentThreads = 25;
            crawlConfig.IsExternalPageCrawlingEnabled = false;
            crawlConfig.MaxCrawlDepth = 1;
            crawlConfig.MaxPagesToCrawl = 1000;
            crawlConfig.UserAgentString = "abot v1.0 http://code.google.com/p/abot";

            crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);

            crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;

            crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                Regex rx = new Regex(@"\d{5}");

                if (!rx.IsMatch(pageToCrawl.Uri.ToString()) && !pageToCrawl.Uri.ToString().Contains("text="))
                    return new CrawlDecision { Allow = false, Reason = "Want only comlinks" };

                return new CrawlDecision { Allow = true, Reason = "OK Link" };
            });
        }
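A minimal, hedged sketch of how this setup might be driven; the handler bodies, the run_abot method, and the target URI below are assumptions and not part of the original, though the argument types match Abot's PageCrawlStartingAsync/PageCrawlCompletedAsync events.

        // Hypothetical handlers referenced by setup_abot above.
        void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
        {
            Console.WriteLine("About to crawl " + e.PageToCrawl.Uri.AbsoluteUri);
        }

        void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            Console.WriteLine("Finished crawling " + e.CrawledPage.Uri.AbsoluteUri);
        }

        // Hypothetical entry point: configure, then start the (blocking) crawl.
        public void run_abot()
        {
            setup_abot();
            crawler.Crawl(new Uri("http://example.com/"));
        }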
Example No. 2
        private static IWebCrawler GetManuallyConfiguredWebCrawler()
        {
            //Create a config object manually
            CrawlConfiguration config = new CrawlConfiguration();
            config.CrawlTimeoutSeconds = 0;
            config.DownloadableContentTypes = "text/html, text/plain";
            config.IsExternalPageCrawlingEnabled = false;
            config.IsExternalPageLinksCrawlingEnabled = false;
            config.IsRespectRobotsDotTextEnabled = false;
            config.IsUriRecrawlingEnabled = false;
            config.MaxConcurrentThreads = 10;
            config.MaxPagesToCrawl = 10;
            config.MaxPagesToCrawlPerDomain = 0;
            config.MinCrawlDelayPerDomainMilliSeconds = 1000;
            config.UserAgentString = "abot v@ABOTASSEMBLYVERSION@ http://code.google.com/p/abot";

            //Add your own values without modifying Abot's source code.
            //These are accessible in the CrawlContext.CrawlConfiguration.ConfigurationExtensions collection throughout the crawl
            config.ConfigurationExtensions.Add("Somekey1", "SomeValue1");
            config.ConfigurationExtensions.Add("Somekey2", "SomeValue2");

            //Initialize the crawler with custom configuration created above.
            //This overrides the app.config file values
            return new PoliteWebCrawler(config, null, null, null, null, null, null, null, null);
        }
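The ConfigurationExtensions values added above can be read back during the crawl through the crawl context; a minimal sketch (the event handler and URI below are assumptions):

        // Hypothetical usage of the crawler returned by GetManuallyConfiguredWebCrawler.
        IWebCrawler crawler = GetManuallyConfiguredWebCrawler();
        crawler.PageCrawlCompletedAsync += (sender, e) =>
        {
            // Extension values are exposed on the crawl context's configuration.
            string value1 = e.CrawlContext.CrawlConfiguration.ConfigurationExtensions["Somekey1"];
            Console.WriteLine("Somekey1 = " + value1);
        };
        crawler.Crawl(new Uri("http://example.com/"));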
Example No. 3
        public void Constructor_ValidUri_CreatesInstance()
        {
            CrawlConfiguration unitUnderTest = new CrawlConfiguration();

            Assert.IsNotNull(unitUnderTest.ConfigurationExtensions);
            Assert.AreEqual(0, unitUnderTest.ConfigurationExtensions.Count);
            Assert.AreEqual(0, unitUnderTest.CrawlTimeoutSeconds);
            Assert.AreEqual("text/html", unitUnderTest.DownloadableContentTypes);
            Assert.AreEqual(false, unitUnderTest.IsExternalPageCrawlingEnabled);
            Assert.AreEqual(false, unitUnderTest.IsExternalPageLinksCrawlingEnabled);
            Assert.AreEqual(false, unitUnderTest.IsRespectRobotsDotTextEnabled);
            Assert.AreEqual(false, unitUnderTest.IsUriRecrawlingEnabled);
            Assert.AreEqual(10, unitUnderTest.MaxConcurrentThreads);
            Assert.AreEqual(5, unitUnderTest.MaxRobotsDotTextCrawlDelayInSeconds);
            Assert.AreEqual(1000, unitUnderTest.MaxPagesToCrawl);
            Assert.AreEqual(0, unitUnderTest.MaxPagesToCrawlPerDomain);
            Assert.AreEqual(0, unitUnderTest.MinCrawlDelayPerDomainMilliSeconds);
            Assert.AreEqual("Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; abot v@ABOTASSEMBLYVERSION@ http://code.google.com/p/abot)", unitUnderTest.UserAgentString);
            Assert.AreEqual("abot", unitUnderTest.RobotsDotTextUserAgentString);
            Assert.AreEqual(0, unitUnderTest.MaxPageSizeInBytes);
            Assert.AreEqual(0, unitUnderTest.HttpServicePointConnectionLimit);
            Assert.AreEqual(0, unitUnderTest.HttpRequestTimeoutInSeconds);
            Assert.AreEqual(7, unitUnderTest.HttpRequestMaxAutoRedirects);
            Assert.AreEqual(true, unitUnderTest.IsHttpRequestAutoRedirectsEnabled);
            Assert.AreEqual(false, unitUnderTest.IsHttpRequestAutomaticDecompressionEnabled);
            Assert.AreEqual(0, unitUnderTest.MaxMemoryUsageCacheTimeInSeconds);
            Assert.AreEqual(0, unitUnderTest.MaxMemoryUsageInMb);
            Assert.AreEqual(0, unitUnderTest.MinAvailableMemoryRequiredInMb);
            Assert.AreEqual(100, unitUnderTest.MaxCrawlDepth);
        }
Example No. 4
 public static void CrawlerInit()
 {
     _crawledPages = new Dictionary<string, CrawledWebPage>();
     _crawlConfig = new CrawlConfiguration
                     {
                         CrawlTimeoutSeconds = 100,
                         MaxConcurrentThreads = 10,
                         MaxPagesToCrawl = 1000,
                         UserAgentString = "abot v1.0 http://code.google.com/p/abot",
                         DownloadableContentTypes = "text/html, text/plain",
                         IsUriRecrawlingEnabled = false,
                         IsExternalPageCrawlingEnabled = true,
                         IsExternalPageLinksCrawlingEnabled = true,
                         HttpServicePointConnectionLimit = 200,
                         HttpRequestTimeoutInSeconds = 15,
                         HttpRequestMaxAutoRedirects = 7,
                         IsHttpRequestAutoRedirectsEnabled = true,
                         IsHttpRequestAutomaticDecompressionEnabled = true,
                         MinAvailableMemoryRequiredInMb = 0,
                         MaxMemoryUsageInMb = 200,
                         MaxMemoryUsageCacheTimeInSeconds = 2,
                         MaxCrawlDepth = 5,//10,
                         IsRespectRobotsDotTextEnabled = true
                     };
 }
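A hedged sketch of how _crawlConfig and _crawledPages might then be used; the StartCrawl method, the root URI parameter, and the way CrawledWebPage instances are created are assumptions, not part of the original.

 // Hypothetical follow-up to CrawlerInit: build a crawler from _crawlConfig and
 // record each completed page in _crawledPages, keyed by its absolute URI.
 public static void StartCrawl(Uri rootUri)
 {
     var crawler = new PoliteWebCrawler(_crawlConfig);
     crawler.PageCrawlCompletedAsync += (sender, e) =>
     {
         // CrawledWebPage is application-specific; how it is populated from
         // e.CrawledPage is not shown in the original, so this is a placeholder.
         _crawledPages[e.CrawledPage.Uri.AbsoluteUri] = new CrawledWebPage();
     };
     crawler.Crawl(rootUri);
 }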
Example No. 5
        public void Crawl(Uri uri, Action<Page> callback)
        {
            var crawlConfig = new CrawlConfiguration
            {
                CrawlTimeoutSeconds = 0,
                MaxConcurrentThreads = 5,
                UserAgentString = "InspectionCrawler v1.0",
                MinCrawlDelayPerDomainMilliSeconds = 1000,
                MaxPagesToCrawl = 0,
                MaxPagesToCrawlPerDomain = 0,
                MaxCrawlDepth = int.MaxValue
            };

            var crawler = new PoliteWebCrawler(crawlConfig);

            crawler.PageCrawlCompletedAsync += (sender, args) =>
            {
                var page = args.CrawledPage;

                if (page.WebException != null && page.HttpWebResponse == null)
                {
                    _log.Log(new LogMessage(LogType.Error, "Could not get page", page.WebException, page.Uri));
                    return;
                }

                callback(Convert(args.CrawledPage));

            };

            crawler.Crawl(uri);
        }
Example No. 6
        static void Main(string[] args)
        {
            CrawlConfiguration config = new CrawlConfiguration();
            config.MaxConcurrentThreads = 1; // Web Extractor is not currently thread-safe.

            // Create the PhantomJS instance. This will spawn a new PhantomJS process using phantomjs.exe.
            // Make sure to dispose this instance or you will have a zombie process!
            IWebDriver driver = CreatePhantomJsDriver(config);

            // Create the content extractor that uses PhantomJS.
            IWebContentExtractor extractor = new JavaScriptContentExtractor(driver);

            // Create a PageRequester that will use the extractor.
            IPageRequester requester = new PageRequester(config, extractor);

            using (IWebCrawler crawler = new PoliteWebCrawler(config, null, null, null, requester, null, null, null, null)) {
                crawler.PageCrawlCompleted += OnPageCrawlCompleted;

                CrawlResult result = crawler.Crawl(new Uri("http://wvtesting2.com/"));
                if (result.ErrorOccurred)
                    Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
                else
                    Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
            }

            Console.Read();
        }
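As the comment above warns, the PhantomJS driver spawns a separate process that must be disposed; a hedged sketch of one way to guarantee that (the try/finally wrapper is an assumption, not part of the original):

            // Ensure the phantomjs.exe process is torn down even if the crawl throws.
            IWebDriver driver = CreatePhantomJsDriver(config);
            try
            {
                IWebContentExtractor extractor = new JavaScriptContentExtractor(driver);
                // ... build the requester and crawler, run the crawl as above ...
            }
            finally
            {
                driver.Quit();     // terminates the PhantomJS process
                driver.Dispose();
            }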
Example No. 7
 static void Main(string[] args) {
     var proxies = new WebProxy[] { new WebProxy("http://192.168.1.1:3128"), new WebProxy("http://192.168.1.2:3128") };
     var crawlConfig = new CrawlConfiguration();
     var domainLimiter = new MultiProxyDomainRateLimiter(crawlConfig.MinCrawlDelayPerDomainMilliSeconds);
     var pageRequester = new MultiProxyPageRequester(crawlConfig, new WebContentExtractor(), proxies);
     var crawler = new MultiProxyPoliteWebCrawler(crawlConfig, null, null, null, pageRequester, null, null, domainLimiter, null);
     crawler.Crawl(new Uri("http://localhost/"));
 }
Example No. 8
        public PageRequester(CrawlConfiguration config)
        {
            if (config == null)
                throw new ArgumentNullException("config");

            _userAgentString = config.UserAgentString.Replace("@ABOTASSEMBLYVERSION@", Assembly.GetAssembly(this.GetType()).GetName().Version.ToString());
            _config = config;

            if (_config.HttpServicePointConnectionLimit > 0)
                ServicePointManager.DefaultConnectionLimit = _config.HttpServicePointConnectionLimit;
        }
Example No. 9
        public PageRequester(CrawlConfiguration config, IWebContentExtractor contentExtractor)
        {
            if (config == null)
                throw new ArgumentNullException("config");

            _config = config;

            if (_config.HttpServicePointConnectionLimit > 0)
                ServicePointManager.DefaultConnectionLimit = _config.HttpServicePointConnectionLimit;

            _extractor = contentExtractor ?? new WebContentExtractor();
        }
Example No. 10
        public CrawlConfiguration Convert()
        {
            AutoMapper.Mapper.CreateMap<CrawlBehaviorElement, CrawlConfiguration>();
            AutoMapper.Mapper.CreateMap<PolitenessElement, CrawlConfiguration>();

            CrawlConfiguration config = new CrawlConfiguration();
            AutoMapper.Mapper.Map<CrawlBehaviorElement, CrawlConfiguration>(CrawlBehavior, config);
            AutoMapper.Mapper.Map<PolitenessElement, CrawlConfiguration>(Politeness, config);

            foreach (ExtensionValueElement element in ExtensionValues)
                config.ConfigurationExtensions.Add(element.Key, element.Value);

            return config;
        }
Example No. 11
 public PoliteWebCrawler(
     CrawlConfiguration crawlConfiguration,
     ICrawlDecisionMaker crawlDecisionMaker,
     IThreadManager threadManager,
     IScheduler scheduler,
     IPageRequester pageRequester,
     IHyperLinkParser hyperLinkParser,
     IMemoryManager memoryManager,
     IDomainRateLimiter domainRateLimiter,
     IRobotsDotTextFinder robotsDotTextFinder)
     : base(crawlConfiguration, crawlDecisionMaker, threadManager, scheduler, pageRequester, hyperLinkParser, memoryManager)
 {
     _domainRateLimiter = domainRateLimiter ?? new DomainRateLimiter(_crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds);
     _robotsDotTextFinder = robotsDotTextFinder ?? new RobotsDotTextFinder(new PageRequester(_crawlContext.CrawlConfiguration));
 }
Example No. 12
        public void Crawl_CrawlTimeoutIs1Sec_TimesOut()
        {
            CrawlConfiguration configuration = new CrawlConfiguration();
            configuration.CrawlTimeoutSeconds = 2;

            int pagesCrawledCount = 0;

            PoliteWebCrawler crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);
            crawler.PageCrawlCompletedAsync += (a, b) => pagesCrawledCount++;

            CrawlResult result = crawler.Crawl(new Uri("http://localhost.fiddler:1111/"));

            Assert.IsFalse(result.ErrorOccurred);
            Assert.IsTrue(result.Elapsed.TotalSeconds < 8, "Took more than 8 seconds");
            Assert.IsTrue(pagesCrawledCount < 2, "Crawled more than 2 pages");
        }
Example No. 13
        public void Crawl_CrawlTimeoutIs1Sec_TimesOut()
        {
            CrawlConfiguration configuration = new CrawlConfiguration ();
            configuration.CrawlTimeoutSeconds = 1;

            int pagesCrawledCount = 0;

            PoliteWebCrawler crawler = new PoliteWebCrawler (configuration, null, null, null, null, null, null, null, null);
            crawler.PageCrawlCompletedAsync += (a, b) => pagesCrawledCount++;

            CrawlResult result = crawler.Crawl (new Uri ("http://wvtesting2.com/"));

            Assert.IsFalse (result.ErrorOccurred);
            Assert.IsTrue (result.Elapsed.TotalSeconds < 5);
            Assert.IsTrue (pagesCrawledCount > 0);
        }
Example No. 14
        public void Crawl_MaxPagesTo25_OnlyCrawls25Pages()
        {
            new PageRequester(new CrawlConfiguration { UserAgentString = "aaa" }).MakeRequest(new Uri("http://localhost.fiddler:1111/PageGenerator/ClearCounters"));

            CrawlConfiguration configuration = new CrawlConfiguration();
            configuration.MaxPagesToCrawl = 25;

            int pagesCrawledCount = 0;

            PoliteWebCrawler crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);
            crawler.PageCrawlCompletedAsync += (a, b) => pagesCrawledCount++;

            crawler.Crawl(new Uri("http://localhost.fiddler:1111/"));

            Assert.AreEqual(25, pagesCrawledCount);
        }
Example No. 15
        public PageRequester(CrawlConfiguration config, IWebContentExtractor contentExtractor)
        {
            if (config == null)
                throw new ArgumentNullException("config");

            _config = config;

            if (_config.HttpServicePointConnectionLimit > 0)
                ServicePointManager.DefaultConnectionLimit = _config.HttpServicePointConnectionLimit;

            if (!_config.IsSslCertificateValidationEnabled)
                ServicePointManager.ServerCertificateValidationCallback +=
                    (sender, certificate, chain, sslPolicyErrors) => true;

            _extractor = contentExtractor ?? new WebContentExtractor();
        }
Example No. 16
        public void SetUp()
        {
            _fakeHyperLinkParser = new Mock<IHyperLinkParser>();
            _fakeHttpRequester = new Mock<IPageRequester>();
            _fakeCrawlDecisionMaker = new Mock<ICrawlDecisionMaker>();
            _fakeDomainRateLimiter = new Mock<IDomainRateLimiter>();
            _fakeMemoryManager = new Mock<IMemoryManager>();
            _fakeRobotsDotTextFinder = new Mock<IRobotsDotTextFinder>();
            _fakeRobotsDotText = new Mock<IRobotsDotText>();

            _dummyScheduler = new Scheduler();
            _dummyThreadManager = new ManualThreadManager(1);
            _dummyConfiguration = new CrawlConfiguration();
            _dummyConfiguration.ConfigurationExtensions.Add("somekey", "someval");

            _rootUri = new Uri("http://a.com/");
        }
Example No. 17
        public void Crawl_MaxPagesTo5_WithCrawlDelay_OnlyCrawls5Pages()
        {
            new PageRequester(new CrawlConfiguration{ UserAgentString = "aaa" }).MakeRequest(new Uri("http://localhost.fiddler:1111/PageGenerator/ClearCounters"));

            CrawlConfiguration configuration = new CrawlConfiguration();
            configuration.MinCrawlDelayPerDomainMilliSeconds = 1000; //adding a delay since it increases the chance of Abot crawling more pages than MaxPagesToCrawl.
            configuration.MaxPagesToCrawl = 5;

            int pagesCrawledCount = 0;

            PoliteWebCrawler crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);
            crawler.PageCrawlCompletedAsync += (a, b) => pagesCrawledCount++;

            crawler.Crawl(new Uri("http://localhost.fiddler:1111/"));

            Assert.AreEqual(5, pagesCrawledCount);
        }
Example No. 18
        private static IWebCrawler GetManuallyConfiguredWebCrawler()
        {
            var crawlConfiguration = new CrawlConfiguration();

            crawlConfiguration.CrawlTimeoutSeconds = 0;
            crawlConfiguration.DownloadableContentTypes = "text/html, text/plain";
            crawlConfiguration.IsExternalPageCrawlingEnabled = false;
            crawlConfiguration.IsExternalPageLinksCrawlingEnabled = false;
            crawlConfiguration.IsRespectRobotsDotTextEnabled = false;
            crawlConfiguration.IsUriRecrawlingEnabled = false;
            crawlConfiguration.MaxConcurrentThreads = 10;
            crawlConfiguration.MaxPagesToCrawl = 10;
            crawlConfiguration.MaxPagesToCrawlPerDomain = 0;
            crawlConfiguration.MinCrawlDelayPerDomainMilliSeconds = 1000;

            crawlConfiguration.ConfigurationExtensions.Add("Somekey1", "SomeValue1");
            crawlConfiguration.ConfigurationExtensions.Add("Somekey2", "SomeValue2");

            return new PoliteWebCrawler(crawlConfiguration, null, null, null, null, null, null, null, null);
        }
Example No. 19
        public void SetUp()
        {
            _fakeHyperLinkParser = new Mock<IHyperLinkParser>();
            _fakeHttpRequester = new Mock<IPageRequester>();
            _fakeCrawlDecisionMaker = new Mock<ICrawlDecisionMaker>();
            _fakeMemoryManager = new Mock<IMemoryManager>();
            _fakeDomainRateLimiter = new Mock<IDomainRateLimiter>();
            _fakeRobotsDotTextFinder = new Mock<IRobotsDotTextFinder>();


            _dummyScheduler = new Scheduler();
            _dummyThreadManager = new TaskThreadManager(10);
            _dummyConfiguration = new CrawlConfiguration();
            _dummyConfiguration.ConfigurationExtensions.Add("somekey", "someval");

            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);
            _unitUnderTest.CrawlBag.SomeVal = "SomeVal";
            _unitUnderTest.CrawlBag.SomeList = new List<string>() { "a", "b" };
            _rootUri = new Uri("http://a.com/");
        }
Example No. 20
        private static IWebDriver CreatePhantomJsDriver(CrawlConfiguration p_Config)
        {
            // Optional options passed to the PhantomJS process.
            PhantomJSOptions options = new PhantomJSOptions();
            options.AddAdditionalCapability("phantomjs.page.settings.userAgent", p_Config.UserAgentString);
            options.AddAdditionalCapability("phantomjs.page.settings.javascriptCanCloseWindows", false);
            options.AddAdditionalCapability("phantomjs.page.settings.javascriptCanOpenWindows", false);
            options.AddAdditionalCapability("acceptSslCerts", !p_Config.IsSslCertificateValidationEnabled);

            // Basic auth credentials.
            options.AddAdditionalCapability("phantomjs.page.settings.userName", p_Config.LoginUser);
            options.AddAdditionalCapability("phantomjs.page.settings.password", p_Config.LoginPassword);

            // Create the service while hiding the prompt window.
            PhantomJSDriverService service = PhantomJSDriverService.CreateDefaultService();
            service.HideCommandPromptWindow = true;
            IWebDriver driver = new PhantomJSDriver(service, options);

            return driver;
        }
Example No. 21
        public void Constructor_ValidUri_CreatesInstance()
        {
            CrawlConfiguration unitUnderTest = new CrawlConfiguration();

            Assert.IsNotNull(unitUnderTest.ConfigurationExtensions);
            Assert.AreEqual(0, unitUnderTest.ConfigurationExtensions.Count);
            Assert.AreEqual(0, unitUnderTest.CrawlTimeoutSeconds);
            Assert.AreEqual("text/html", unitUnderTest.DownloadableContentTypes);
            Assert.AreEqual(false, unitUnderTest.IsExternalPageCrawlingEnabled);
            Assert.AreEqual(false, unitUnderTest.IsExternalPageLinksCrawlingEnabled);
            Assert.AreEqual(false, unitUnderTest.IsRespectRobotsDotTextEnabled);
            Assert.AreEqual(false, unitUnderTest.IsRespectMetaRobotsNoFollowEnabled);
            Assert.AreEqual(false, unitUnderTest.IsRespectAnchorRelNoFollowEnabled);
            Assert.AreEqual(false, unitUnderTest.IsUriRecrawlingEnabled);
            Assert.AreEqual(10, unitUnderTest.MaxConcurrentThreads);
            Assert.AreEqual(5, unitUnderTest.MaxRobotsDotTextCrawlDelayInSeconds);
            Assert.AreEqual(1000, unitUnderTest.MaxPagesToCrawl);
            Assert.AreEqual(0, unitUnderTest.MaxPagesToCrawlPerDomain);
            Assert.AreEqual(0, unitUnderTest.MinCrawlDelayPerDomainMilliSeconds);
            Assert.AreEqual("Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko", unitUnderTest.UserAgentString);
            Assert.AreEqual("abot", unitUnderTest.RobotsDotTextUserAgentString);
            Assert.AreEqual(0, unitUnderTest.MaxPageSizeInBytes);
            Assert.AreEqual(200, unitUnderTest.HttpServicePointConnectionLimit);
            Assert.AreEqual(15, unitUnderTest.HttpRequestTimeoutInSeconds);
            Assert.AreEqual(7, unitUnderTest.HttpRequestMaxAutoRedirects);
            Assert.AreEqual(true, unitUnderTest.IsHttpRequestAutoRedirectsEnabled);
            Assert.AreEqual(false, unitUnderTest.IsHttpRequestAutomaticDecompressionEnabled);
            Assert.AreEqual(false, unitUnderTest.IsSendingCookiesEnabled);
            Assert.AreEqual(true, unitUnderTest.IsSslCertificateValidationEnabled);
            Assert.AreEqual(false, unitUnderTest.IsRespectUrlNamedAnchorOrHashbangEnabled);
            Assert.AreEqual(0, unitUnderTest.MaxMemoryUsageCacheTimeInSeconds);
            Assert.AreEqual(0, unitUnderTest.MaxMemoryUsageInMb);
            Assert.AreEqual(0, unitUnderTest.MinAvailableMemoryRequiredInMb);
            Assert.AreEqual(100, unitUnderTest.MaxCrawlDepth);
            Assert.AreEqual(false, unitUnderTest.IsForcedLinkParsingEnabled);
            Assert.AreEqual(0, unitUnderTest.MaxRetryCount);
            Assert.AreEqual(0, unitUnderTest.MinRetryDelayInMilliseconds);
            Assert.AreEqual(null, unitUnderTest.LoginUser);
            Assert.AreEqual(null, unitUnderTest.LoginPassword);
            Assert.AreEqual(false, unitUnderTest.IsAlwaysLogin);
        }
Example No. 22
        public void Crawl_IsRateLimited()
        {
            new PageRequester(new CrawlConfiguration { UserAgentString = "aaa" }).MakeRequest(new Uri("http://localhost.fiddler:1111/PageGenerator/ClearCounters"));

            CrawlConfiguration configuration = new CrawlConfiguration();
            configuration.MaxPagesToCrawl = 3;
            configuration.MinCrawlDelayPerDomainMilliSeconds = 1000; // 1 second * 2 pages = 2 (or more) seconds

            int pagesCrawledCount = 0;

            var crawler = new PoliteWebCrawler(configuration);
            crawler.PageCrawlCompletedAsync += (a, b) => pagesCrawledCount++;

            var uriToCrawl = new Uri("http://localhost.fiddler:1111/");
            var start = DateTime.Now;
            crawler.Crawl(uriToCrawl);
            var elapsed = DateTime.Now - start;

            Assert.GreaterOrEqual(elapsed.TotalMilliseconds, 2000);
            Assert.AreEqual(3, pagesCrawledCount);
        }
Example No. 23
        public CSQueryHyperlinkParser(CrawlConfiguration config, Func<string, string> cleanURLFunc)
            : base(config, cleanURLFunc)
        {

        }
Example No. 24
        /// <summary>
        /// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor.
        /// </summary>
        /// <param name="threadManager">Distributes http requests over multiple threads</param>
        /// <param name="scheduler">Decides what link should be crawled next</param>
        /// <param name="httpRequester">Makes the raw http requests</param>
        /// <param name="hyperLinkParser">Parses a crawled page for it's hyperlinks</param>
        /// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
        /// <param name="crawlConfiguration">Configurable crawl values</param>
        public WebCrawler(
            CrawlConfiguration crawlConfiguration, 
            ICrawlDecisionMaker crawlDecisionMaker, 
            IThreadManager threadManager, 
            IScheduler scheduler, 
            IPageRequester httpRequester, 
            IHyperLinkParser hyperLinkParser, 
            IMemoryManager memoryManager)
        {
            _crawlContext = new CrawlContext();
            _crawlContext.CrawlConfiguration = crawlConfiguration ?? GetCrawlConfigurationFromConfigFile() ?? new CrawlConfiguration();
            CrawlBag = _crawlContext.CrawlBag;

            _threadManager = threadManager ?? new ManualThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads);
            _scheduler = scheduler ?? new FifoScheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled);
            _httpRequester = httpRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
            _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

            if(_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0
                || _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
                _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));

            _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser();

            _crawlContext.Scheduler = _scheduler;
        }
Example No. 25
 public PoliteWebCrawler(CrawlConfiguration crawlConfiguration)
     : this(crawlConfiguration, null, null, null, null, null, null, null, null)
 {
 }
Example No. 26
 public PageRequesterWrapper(CrawlConfiguration config)
     : base(config)
 {
 }
Example No. 27
        public void ShouldCrawlPage_OverMaxPagesToCrawlPerDomain_IsRetry_ReturnsTrue()
        {
            Uri uri = new Uri("http://a.com/");
            CrawlConfiguration config = new CrawlConfiguration
            {
                MaxPagesToCrawlPerDomain = 100
            };
            ConcurrentDictionary<string, int> countByDomain = new ConcurrentDictionary<string, int>();
            countByDomain.TryAdd(uri.Authority, 100);
            CrawlContext crawlContext = new CrawlContext
            {
                CrawlConfiguration = config,
                CrawlStartDate = DateTime.Now,
                CrawlCountByDomain = countByDomain
            };

            CrawlDecision result = _unitUnderTest.ShouldCrawlPage(
                new PageToCrawl(new Uri(uri.AbsoluteUri + "anotherpage"))
                {
                    IsRetry = true,
                    IsInternal = true
                },
                crawlContext);

            Assert.IsTrue(result.Allow);
            Assert.IsFalse(result.ShouldHardStopCrawl);
            Assert.IsFalse(result.ShouldStopCrawl);
        }
Example No. 28
 public void Dispose()
 {
     if (_extractor != null)
     {
         _extractor.Dispose();
     }
     _cookieContainer = null;
     _config = null;
 }
Example No. 29
        protected virtual void PrintConfigValues(CrawlConfiguration config)
        {
            _logger.Info("Configuration Values:");

            string indentString = new string(' ', 2);
            foreach (PropertyInfo property in config.GetType().GetProperties())
            {
                if (property.Name != "ConfigurationExtensions")
                    _logger.InfoFormat("{0}{1}: {2}", indentString, property.Name, property.GetValue(config, null));
            }

            foreach (string key in config.ConfigurationExtensions.Keys)
            {
                _logger.InfoFormat("{0}{1}: {2}", indentString, key, config.ConfigurationExtensions[key]);
            }
        }
Example No. 30
 public PageRequester(CrawlConfiguration config)
     : this(config, null)
 {
 }