Example #1
        static void DoCrawl()
        {
            CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();

            crawlConfig.CrawlTimeoutSeconds  = 100;
            crawlConfig.MaxConcurrentThreads = 10;
            crawlConfig.MaxPagesToCrawl      = 5000;
            crawlConfig.UserAgentString      = "abot v1.0 http://code.google.com/p/abot";
            //crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue1", "1111");
            //crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue2", "2222");

            //Will use app.config for configuration
            PoliteWebCrawler crawler = new PoliteWebCrawler();

            crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            CrawlResult result = crawler.Crawl(new Uri("http://sunnah.com/"));

            Console.WriteLine("jumlah crawled content :" + result.CrawlContext.CrawledCount);
            if (result.ErrorOccurred)
            {
                Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
            }
            else
            {
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
            }
        }
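The four handlers registered above are not part of this snippet. A minimal sketch of what they might look like, based on Abot's standard event argument types (needs System.Net for HttpStatusCode; the method names simply match the registrations above):

        static void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
        {
            // Raised just before the page is requested.
            Console.WriteLine("About to crawl {0}", e.PageToCrawl.Uri.AbsoluteUri);
        }

        static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // Raised after the page has been downloaded and parsed.
            CrawledPage crawledPage = e.CrawledPage;
            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
                Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
            else
                Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);
        }

        static void crawler_PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e)
        {
            Console.WriteLine("Did not crawl page {0} due to {1}", e.PageToCrawl.Uri.AbsoluteUri, e.DisallowedReason);
        }

        static void crawler_PageLinksCrawlDisallowed(object sender, PageLinksCrawlDisallowedArgs e)
        {
            Console.WriteLine("Did not crawl the links on page {0} due to {1}", e.CrawledPage.Uri.AbsoluteUri, e.DisallowedReason);
        }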
Example #2
        public WebsiteIndexer(string host, ICollection <string> ignoredPathes = null, int delayPerRequestMilliSeconds = 1000, int maxPagesToCrawl = 1000)
        {
            _host = host;

            var config = new CrawlConfiguration
            {
                MaxPagesToCrawl = maxPagesToCrawl,
                MinCrawlDelayPerDomainMilliSeconds = delayPerRequestMilliSeconds,
                IsExternalPageCrawlingEnabled      = false
            };

            Crawler = new PoliteWebCrawler(config)
            {
                ShouldCrawlPageDecisionMaker = (pageToCrawl, crawlContext) =>
                {
                    var ignored = string.IsNullOrEmpty(pageToCrawl.Uri?.AbsolutePath) || ignoredPathes?.Any(p => Regex.IsMatch(pageToCrawl.Uri.AbsolutePath, p)) == true;
                    if (ignored)
                    {
                        Console.WriteLine($"Ignored '{pageToCrawl.Uri?.AbsolutePath}'");
                        return(new CrawlDecision {
                            Allow = false, Reason = "Path matches pattern in blacklist"
                        });
                    }

                    return(new CrawlDecision {
                        Allow = true
                    });
                }
            };

            Crawler.PageCrawlCompleted += PageCrawlCompleted;
        }
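A hypothetical call site for the constructor above. The host value, the ignore patterns, and the assumption that the Crawler property assigned above is publicly exposed are all illustrative; the patterns are regular expressions matched against AbsolutePath, as in the decision maker above:

        var indexer = new WebsiteIndexer(
            "example.com",
            ignoredPathes: new[] { "^/login", "\\.pdf$" },
            delayPerRequestMilliSeconds: 500,
            maxPagesToCrawl: 200);

        // Assumes Crawler (assigned in the constructor) is accessible to the caller.
        indexer.Crawler.Crawl(new Uri("https://example.com/"));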
Example #3
        public void Crawl(CrawlRequest request)
        {
            CrawlConfiguration crawlConfig = new CrawlConfiguration();

            crawlConfig.CrawlTimeoutSeconds  = 100;
            crawlConfig.MaxConcurrentThreads = 10;
            crawlConfig.MaxPagesToCrawl      = 1000;
            crawlConfig.UserAgentString      = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; abot v1.0 http://code.google.com/p/abot)";
            crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue1", "1111");
            crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue2", "2222");
            crawlConfig.MaxCrawlDepth            = 10;
            crawlConfig.DownloadableContentTypes = "text/html, text/plain";

            //Will use the manually created crawlConfig object created above
            PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);

            crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            CrawlResult result = crawler.Crawl(new Uri(request.EntryURL));

            if (result.ErrorOccurred)
            {
                Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
            }
            else
            {
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
            }
        }
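The ConfigurationExtensions entries added above travel with the crawl and can be read back from the CrawlContext inside any handler; a small sketch using a lambda handler (same event wiring as in this example):

            crawler.PageCrawlCompletedAsync += (sender, e) =>
            {
                // ConfigurationExtensions is a plain string-to-string dictionary on the configuration.
                string custom1 = e.CrawlContext.CrawlConfiguration.ConfigurationExtensions["SomeCustomConfigValue1"];
                Console.WriteLine("Crawled {0} (SomeCustomConfigValue1 = {1})", e.CrawledPage.Uri.AbsoluteUri, custom1);
            };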
Example #4
        public ProxyPageRequester(HttpClientHandler torHandler, CrawlConfiguration config, IWebContentExtractor contentExtractor = null, HttpClient httpClient = null) : base(config, contentExtractor, httpClient)
        {
            _config           = config;
            _contentExtractor = contentExtractor;

            _torHandler = torHandler;
        }
Example #5
 private void Map(CrawlBehaviorElement src, CrawlConfiguration dest)
 {
     dest.MaxConcurrentThreads                       = src.MaxConcurrentThreads;
     dest.MaxPagesToCrawl                            = src.MaxPagesToCrawl;
     dest.MaxPagesToCrawlPerDomain                   = src.MaxPagesToCrawlPerDomain;
     dest.MaxPageSizeInBytes                         = src.MaxPageSizeInBytes;
     dest.UserAgentString                            = src.UserAgentString;
     dest.HttpProtocolVersion                        = GetHttpProtocolVersion(src);
     dest.CrawlTimeoutSeconds                        = src.CrawlTimeoutSeconds;
     dest.IsUriRecrawlingEnabled                     = src.IsUriRecrawlingEnabled;
     dest.IsExternalPageCrawlingEnabled              = src.IsExternalPageCrawlingEnabled;
     dest.IsExternalPageLinksCrawlingEnabled         = src.IsExternalPageLinksCrawlingEnabled;
     dest.IsRespectUrlNamedAnchorOrHashbangEnabled   = src.IsRespectUrlNamedAnchorOrHashbangEnabled;
     dest.DownloadableContentTypes                   = src.DownloadableContentTypes;
     dest.HttpServicePointConnectionLimit            = src.HttpServicePointConnectionLimit;
     dest.HttpRequestTimeoutInSeconds                = src.HttpRequestTimeoutInSeconds;
     dest.HttpRequestMaxAutoRedirects                = src.HttpRequestMaxAutoRedirects;
     dest.IsHttpRequestAutoRedirectsEnabled          = src.IsHttpRequestAutoRedirectsEnabled;
     dest.IsHttpRequestAutomaticDecompressionEnabled = src.IsHttpRequestAutomaticDecompressionEnabled;
     dest.IsSendingCookiesEnabled                    = src.IsSendingCookiesEnabled;
     dest.IsSslCertificateValidationEnabled          = src.IsSslCertificateValidationEnabled;
     dest.MinAvailableMemoryRequiredInMb             = src.MinAvailableMemoryRequiredInMb;
     dest.MaxMemoryUsageInMb                         = src.MaxMemoryUsageInMb;
     dest.MaxMemoryUsageCacheTimeInSeconds           = src.MaxMemoryUsageCacheTimeInSeconds;
     dest.MaxCrawlDepth               = src.MaxCrawlDepth;
     dest.MaxLinksPerPage             = src.MaxLinksPerPage;
     dest.IsForcedLinkParsingEnabled  = src.IsForcedLinkParsingEnabled;
     dest.MaxRetryCount               = src.MaxRetryCount;
     dest.MinRetryDelayInMilliseconds = src.MinRetryDelayInMilliseconds;
 }
Example #6
        public void Convert_CovertsFromSectionObjectToDtoObject()
        {
            CrawlConfiguration result = _config.Convert();

            Assert.IsNotNull(result);
            Assert.AreEqual(result.CrawlTimeoutSeconds, _config.CrawlBehavior.CrawlTimeoutSeconds);
            Assert.AreEqual(result.DownloadableContentTypes, _config.CrawlBehavior.DownloadableContentTypes);
            Assert.AreEqual(result.IsUriRecrawlingEnabled, _config.CrawlBehavior.IsUriRecrawlingEnabled);
            Assert.AreEqual(result.MaxConcurrentThreads, _config.CrawlBehavior.MaxConcurrentThreads);
            Assert.AreEqual(result.MaxPagesToCrawl, _config.CrawlBehavior.MaxPagesToCrawl);
            Assert.AreEqual(result.MaxPagesToCrawlPerDomain, _config.CrawlBehavior.MaxPagesToCrawlPerDomain);
            Assert.AreEqual(result.MaxPageSizeInBytes, _config.CrawlBehavior.MaxPageSizeInBytes);
            Assert.AreEqual(result.UserAgentString, _config.CrawlBehavior.UserAgentString);
            Assert.AreEqual(result.IsExternalPageCrawlingEnabled, _config.CrawlBehavior.IsExternalPageCrawlingEnabled);
            Assert.AreEqual(result.IsExternalPageLinksCrawlingEnabled, _config.CrawlBehavior.IsExternalPageLinksCrawlingEnabled);
            Assert.AreEqual(result.HttpServicePointConnectionLimit, _config.CrawlBehavior.HttpServicePointConnectionLimit);
            Assert.AreEqual(result.HttpRequestTimeoutInSeconds, _config.CrawlBehavior.HttpRequestTimeoutInSeconds);
            Assert.AreEqual(result.HttpRequestMaxAutoRedirects, _config.CrawlBehavior.HttpRequestMaxAutoRedirects);
            Assert.AreEqual(true, _config.CrawlBehavior.IsHttpRequestAutoRedirectsEnabled);
            Assert.AreEqual(true, _config.CrawlBehavior.IsHttpRequestAutomaticDecompressionEnabled);
            Assert.AreEqual(result.MinAvailableMemoryRequiredInMb, _config.CrawlBehavior.MinAvailableMemoryRequiredInMb);
            Assert.AreEqual(result.MaxMemoryUsageInMb, _config.CrawlBehavior.MaxMemoryUsageInMb);
            Assert.AreEqual(result.MaxMemoryUsageCacheTimeInSeconds, _config.CrawlBehavior.MaxMemoryUsageCacheTimeInSeconds);
            Assert.AreEqual(result.MaxCrawlDepth, _config.CrawlBehavior.MaxCrawlDepth);
            Assert.AreEqual(result.IsForcedLinkParsingEnabled, _config.CrawlBehavior.IsForcedLinkParsingEnabled);

            Assert.AreEqual(result.IsRespectRobotsDotTextEnabled, _config.Politeness.IsRespectRobotsDotTextEnabled);
            Assert.AreEqual(result.RobotsDotTextUserAgentString, _config.Politeness.RobotsDotTextUserAgentString);
            Assert.AreEqual(result.MinCrawlDelayPerDomainMilliSeconds, _config.Politeness.MinCrawlDelayPerDomainMilliSeconds);
            Assert.AreEqual(result.MaxRobotsDotTextCrawlDelayInSeconds, _config.Politeness.MaxRobotsDotTextCrawlDelayInSeconds);

            Assert.IsNotNull(result.ConfigurationExtensions);
            Assert.AreEqual(result.ConfigurationExtensions["key1"], _config.ExtensionValues[0].Value);
            Assert.AreEqual(result.ConfigurationExtensions["key2"], _config.ExtensionValues[1].Value);
        }
Example #7
        public void Crawl_IsRateLimited()
        {
            new PageRequester(new CrawlConfiguration {
                UserAgentString = "aaa"
            }).MakeRequest(new Uri("http://localhost.fiddler:1111/PageGenerator/ClearCounters"));

            CrawlConfiguration configuration = new CrawlConfiguration();

            configuration.MaxPagesToCrawl = 3;
            configuration.MinCrawlDelayPerDomainMilliSeconds = 1000; // 1 second * 2 pages = 2 (or more) seconds

            int pagesCrawledCount = 0;

            var crawler = new PoliteWebCrawler(configuration);

            crawler.PageCrawlCompletedAsync += (a, b) => pagesCrawledCount++;

            var uriToCrawl = new Uri("http://localhost.fiddler:1111/");
            var start      = DateTime.Now;

            crawler.Crawl(uriToCrawl);
            var elapsed = DateTime.Now - start;

            Assert.GreaterOrEqual(elapsed.TotalMilliseconds, 2000);
            Assert.AreEqual(3, pagesCrawledCount);
        }
Example #8
 private void Map(AuthorizationElement src, CrawlConfiguration dest)
 {
     dest.IsAlwaysLogin         = src.IsAlwaysLogin;
     dest.LoginUser             = src.LoginUser;
     dest.LoginPassword         = src.LoginPassword;
     dest.UseDefaultCredentials = src.UseDefaultCredentials;
 }
Example #9
        /// <summary>
        /// Initializes the crawler from configuration and stores a definition of the instance
        /// </summary>
        /// <param name="seedUrl"></param>
        /// <param name="sessionId"></param>
        /// <param name="crawlerId"></param>
        public bool InitializeCrawler(string seedUrl, int sessionId, int crawlerId)
        {
            var config      = new CrawlConfiguration();
            var abotSection = AbotConfigurationSectionHandler.LoadFromXml();

            if (abotSection != null)
            {
                config = abotSection.Convert();
                _logger.InfoFormat("CrawlConfiguration loaded from app.config");
            }
            else
            {
                config.CrawlTimeoutSeconds                = 100;
                config.MaxConcurrentThreads               = 1;
                config.MaxPagesToCrawl                    = long.MaxValue;
                config.IsExternalPageCrawlingEnabled      = false;
                config.IsExternalPageLinksCrawlingEnabled = false;
                config.MinCrawlDelayPerDomainMilliSeconds = 10000;
                config.DownloadableContentTypes           = "text/html, text/plain";
                config.IsHttpRequestAutoRedirectsEnabled  = true;
                config.IsUriRecrawlingEnabled             = false;
                config.UserAgentString                    = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0";
                _logger.InfoFormat("CrawlConfiguration default loaded");
            }

            return(InitializeCrawler(seedUrl, sessionId, crawlerId, config));
        }
Example #10
        /// <summary>
        /// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor.
        /// </summary>
        /// <param name="threadManager">Distributes http requests over multiple threads</param>
        /// <param name="scheduler">Decides what link should be crawled next</param>
        /// <param name="pageRequester">Makes the raw http requests</param>
        /// <param name="htmlParser">Parses a crawled page for it's hyperlinks</param>
        /// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
        /// <param name="crawlConfiguration">Configurable crawl values</param>
        /// <param name="memoryManager">Checks the memory usage of the host process</param>
        public WebCrawler(
            CrawlConfiguration crawlConfiguration,
            ICrawlDecisionMaker crawlDecisionMaker,
            IThreadManager threadManager,
            IScheduler scheduler,
            IPageRequester pageRequester,
            IHtmlParser htmlParser,
            IMemoryManager memoryManager)
        {
            _crawlContext = new CrawlContext
            {
                CrawlConfiguration = crawlConfiguration ?? new CrawlConfiguration()
            };
            CrawlBag = _crawlContext.CrawlBag;

            _threadManager      = threadManager ?? new TaskThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads > 0 ? _crawlContext.CrawlConfiguration.MaxConcurrentThreads : Environment.ProcessorCount);
            _scheduler          = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, null, null);
            _pageRequester      = pageRequester ?? new PageRequester(_crawlContext.CrawlConfiguration, new WebContentExtractor());
            _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

            if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0 ||
                _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
            {
                _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));
            }

            _htmlParser = htmlParser ?? new AngleSharpHyperlinkParser(_crawlContext.CrawlConfiguration, null);

            _crawlContext.Scheduler = _scheduler;
        }
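The summary notes that passing null for every parameter is the same as calling the empty constructor. A sketch of overriding only some dependencies, using the derived PoliteWebCrawler with the same parameter order as Example #24 below (everything left null falls back to the defaults created in the constructor above):

            var config = new CrawlConfiguration { MaxConcurrentThreads = 5, MaxPagesToCrawl = 100 };

            // Supply only the configuration and a custom scheduler; the other
            // dependencies are null and keep their defaults.
            var crawler = new PoliteWebCrawler(
                config,
                null,
                null,
                new Scheduler(false, null, null),
                null, null, null, null, null);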
Example #11
        public void Constructor_ValidUri_CreatesInstance()
        {
            CrawlConfiguration unitUnderTest = new CrawlConfiguration();

            Assert.IsNotNull(unitUnderTest.ConfigurationExtensions);
            Assert.AreEqual(0, unitUnderTest.ConfigurationExtensions.Count);
            Assert.AreEqual(0, unitUnderTest.CrawlTimeoutSeconds);
            Assert.AreEqual("text/html", unitUnderTest.DownloadableContentTypes);
            Assert.AreEqual(false, unitUnderTest.IsExternalPageCrawlingEnabled);
            Assert.AreEqual(false, unitUnderTest.IsExternalPageLinksCrawlingEnabled);
            Assert.AreEqual(false, unitUnderTest.IsRespectRobotsDotTextEnabled);
            Assert.AreEqual(false, unitUnderTest.IsRespectMetaRobotsNoFollowEnabled);
            Assert.AreEqual(false, unitUnderTest.IsRespectAnchorRelNoFollowEnabled);
            Assert.AreEqual(false, unitUnderTest.IsUriRecrawlingEnabled);
            Assert.AreEqual(10, unitUnderTest.MaxConcurrentThreads);
            Assert.AreEqual(5, unitUnderTest.MaxRobotsDotTextCrawlDelayInSeconds);
            Assert.AreEqual(1000, unitUnderTest.MaxPagesToCrawl);
            Assert.AreEqual(0, unitUnderTest.MaxPagesToCrawlPerDomain);
            Assert.AreEqual(0, unitUnderTest.MinCrawlDelayPerDomainMilliSeconds);
            Assert.AreEqual("Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; abot v@ABOTASSEMBLYVERSION@ http://code.google.com/p/abot)", unitUnderTest.UserAgentString);
            Assert.AreEqual("abot", unitUnderTest.RobotsDotTextUserAgentString);
            Assert.AreEqual(0, unitUnderTest.MaxPageSizeInBytes);
            Assert.AreEqual(0, unitUnderTest.HttpServicePointConnectionLimit);
            Assert.AreEqual(0, unitUnderTest.HttpRequestTimeoutInSeconds);
            Assert.AreEqual(7, unitUnderTest.HttpRequestMaxAutoRedirects);
            Assert.AreEqual(true, unitUnderTest.IsHttpRequestAutoRedirectsEnabled);
            Assert.AreEqual(false, unitUnderTest.IsHttpRequestAutomaticDecompressionEnabled);
            Assert.AreEqual(0, unitUnderTest.MaxMemoryUsageCacheTimeInSeconds);
            Assert.AreEqual(0, unitUnderTest.MaxMemoryUsageInMb);
            Assert.AreEqual(0, unitUnderTest.MinAvailableMemoryRequiredInMb);
            Assert.AreEqual(100, unitUnderTest.MaxCrawlDepth);
            Assert.AreEqual(false, unitUnderTest.IsForcedLinkParsingEnabled);
        }
Example #12
File: WebCrawler.cs Project: yhtsnda/abot
        /// <summary>
        /// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor.
        /// </summary>
        /// <param name="threadManager">Distributes http requests over multiple threads</param>
        /// <param name="scheduler">Decides what link should be crawled next</param>
        /// <param name="pageRequester">Makes the raw http requests</param>
        /// <param name="hyperLinkParser">Parses a crawled page for it's hyperlinks</param>
        /// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
        /// <param name="crawlConfiguration">Configurable crawl values</param>
        /// <param name="memoryManager">Checks the memory usage of the host process</param>
        public WebCrawler(
            CrawlConfiguration crawlConfiguration,
            ICrawlDecisionMaker crawlDecisionMaker,
            IThreadManager threadManager,
            IScheduler scheduler,
            IPageRequester pageRequester,
            IHyperLinkParser hyperLinkParser,
            IMemoryManager memoryManager)
        {
            _crawlContext = new CrawlContext();
            _crawlContext.CrawlConfiguration = crawlConfiguration ?? GetCrawlConfigurationFromConfigFile();
            CrawlBag = _crawlContext.CrawlBag;

            _threadManager      = threadManager ?? new TaskThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads > 0 ? _crawlContext.CrawlConfiguration.MaxConcurrentThreads : Environment.ProcessorCount);
            _scheduler          = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, null, null);
            _pageRequester      = pageRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
            _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

            if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0 ||
                _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
            {
                _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));
            }

            _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser(_crawlContext.CrawlConfiguration.IsRespectMetaRobotsNoFollowEnabled, _crawlContext.CrawlConfiguration.IsRespectAnchorRelNoFollowEnabled);

            _crawlContext.Scheduler = _scheduler;
        }
Example #13
        static void Main(string[] args)
        {
            CrawlConfiguration crawlConfig = new CrawlConfiguration();

            crawlConfig.CrawlTimeoutSeconds  = 100;
            crawlConfig.MaxConcurrentThreads = 1;
            crawlConfig.MaxPagesToCrawl      = 1;


            PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig);


            crawler.PageCrawlStartingAsync  += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
            //crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
            //crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            CrawlResult result = crawler.Crawl(new Uri("http://www.kmhk.kmu.edu.tw/news/list.asp?P_classify=9")); //This is synchronous, it will not go to the next line until the crawl has completed

            if (result.ErrorOccurred)
            {
                Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
            }
            else
            {
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
            }
        }
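As the comment says, Crawl blocks until the whole crawl finishes. In the Abot versions used by several later examples here (which expose CrawlAsync, see Examples #17, #24 and #25), the same call can be awaited instead; a sketch assuming such a version and an async Main:

            // Awaiting keeps the calling thread free while the crawl runs.
            CrawlResult result = await crawler.CrawlAsync(new Uri("http://www.kmhk.kmu.edu.tw/news/list.asp?P_classify=9"));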
 /// <summary>
 /// Map Extension Value element to config
 /// </summary>
 /// <param name="destination"></param>
 /// <param name="source"></param>
 public static void ImportExtensionValueCollection(this CrawlConfiguration destination, ExtensionValueCollection source)
 {
     foreach (ExtensionValueElement element in source)
     {
         destination.ConfigurationExtensions.Add(element.Key, element.Value);
     }
 }
Example #15
        private static IWebCrawler GetManuallyConfiguredWebCrawler()
        {
            //Create a config object manually
            CrawlConfiguration config = new CrawlConfiguration();

            config.CrawlTimeoutSeconds                = 0;
            config.DownloadableContentTypes           = "text/html, text/plain";
            config.IsExternalPageCrawlingEnabled      = false;
            config.IsExternalPageLinksCrawlingEnabled = false;
            config.IsRespectRobotsDotTextEnabled      = false;
            config.IsUriRecrawlingEnabled             = false;
            config.MaxConcurrentThreads               = 100;
            config.MaxPagesToCrawl                    = 50;
            config.MaxPagesToCrawlPerDomain           = 0;
            config.MinCrawlDelayPerDomainMilliSeconds = 1000;

            //Add your own values without modifying Abot's source code.
            //These are accessible in the CrawlContext.CrawlConfiguration.ConfigurationExtensions object throughout the crawl
            config.ConfigurationExtensions.Add("Somekey1", "SomeValue1");
            config.ConfigurationExtensions.Add("Somekey2", "SomeValue2");

            //Initialize the crawler with custom configuration created above.
            //This overrides the app.config file values
            return(new PoliteWebCrawler(config, null, null, null, null, null, null, null, null));
        }
Example #16
        static WebCheckers()
        {
            //Create a config object manually
            CrawlConfiguration config = new CrawlConfiguration();

            config.CrawlTimeoutSeconds                = 0;
            config.DownloadableContentTypes           = "text/html, text/plain";
            config.IsExternalPageCrawlingEnabled      = false;
            config.IsExternalPageLinksCrawlingEnabled = false;
            config.IsRespectRobotsDotTextEnabled      = false;
            config.IsUriRecrawlingEnabled             = false;
            config.MaxConcurrentThreads               = 1;
            config.MaxPagesToCrawl                    = 3000;
            config.MaxPagesToCrawlPerDomain           = 0;
            config.MinCrawlDelayPerDomainMilliSeconds = 1000;
            config.HttpRequestTimeoutInSeconds        = 60;

            //Add your own values without modifying Abot's source code.
            //These are accessible in the CrawlContext.CrawlConfiguration.ConfigurationExtensions object throughout the crawl
            config.ConfigurationExtensions.Add("KeywordExternalLink", "ExternalLink");
            config.ConfigurationExtensions.Add("KeywordID", "ID");
            config.ConfigurationExtensions.Add("BaseAddress", "http://officedevcentersite-devx.azurewebsites.net/");
            //config.ConfigurationExtensions.Add("IngoreUrlType", "htm");
            _config = GetCrawlConfigurationFromConfigFile() ?? config;
        }
Example #17
        public async Task Crawl_MaxPagesTo25_OnlyCrawls25Pages()
        {
            await new PageRequester(new CrawlConfiguration {
                UserAgentString = "aaa"
            }).MakeRequestAsync(new Uri("http://localhost:1111/PageGenerator/ClearCounters"));

            CrawlConfiguration configuration = new CrawlConfiguration();

            configuration.MaxPagesToCrawl = 25;
            configuration.IsExternalPageCrawlingEnabled      = true;
            configuration.IsExternalPageLinksCrawlingEnabled = true;

            int pagesCrawledCount = 0;

            PoliteWebCrawler crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);

            crawler.PageCrawlCompleted += (a, b) =>
            {
                pagesCrawledCount++;
            };

            var res = await crawler.CrawlAsync(new Uri("http://localhost:1111/"));

            Assert.AreEqual(25, pagesCrawledCount);
        }
Example #18
        public AbotExample1()
        {
            var crawlConfig = new CrawlConfiguration();

            _crawler = new EasyWebCrawler(crawlConfig);
            _crawler.CrawlBag.MyFoo1                = new Foo();
            _crawler.CrawlBag.MyFoo2                = new Foo();
            _crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
            _crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            _crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            _crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            //_crawler.ShouldCrawlPage((crawl, context) =>
            //{
            //    var decision = new CrawlDecision()
            //    {
            //        Allow = false,
            //    };
            //    return decision;
            //});
            _crawler.ShouldDownloadPageContent((page, context) =>
            {
                var link = page.Uri;
                Console.WriteLine(" --> detected link : {0}", link);
                return(new CrawlDecision()
                {
                    Allow = false
                });
            });
        }
Example #19
        public void ShouldCrawlPage_OverMaxPagesToCrawlPerDomain_IsRetry_ReturnsTrue()
        {
            Uri uri = new Uri("http://a.com/");
            CrawlConfiguration config = new CrawlConfiguration
            {
                MaxPagesToCrawlPerDomain = 100
            };
            ConcurrentDictionary <string, int> countByDomain = new ConcurrentDictionary <string, int>();

            countByDomain.TryAdd(uri.Authority, 100);
            CrawlContext crawlContext = new CrawlContext
            {
                CrawlConfiguration = config,
                CrawlStartDate     = DateTime.Now,
                CrawlCountByDomain = countByDomain
            };

            CrawlDecision result = _unitUnderTest.ShouldCrawlPage(
                new PageToCrawl(new Uri(uri.AbsoluteUri + "anotherpage"))
            {
                IsRetry    = true,
                IsInternal = true
            },
                crawlContext);

            Assert.IsTrue(result.Allow);
            Assert.IsFalse(result.ShouldHardStopCrawl);
            Assert.IsFalse(result.ShouldStopCrawl);
        }
Example #20
File: WebCrawler.cs Project: dagstuan/abot
        /// <summary>
        /// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor.
        /// </summary>
        /// <param name="threadManager">Distributes http requests over multiple threads</param>
        /// <param name="scheduler">Decides what link should be crawled next</param>
        /// <param name="httpRequester">Makes the raw http requests</param>
        /// <param name="hyperLinkParser">Parses a crawled page for it's hyperlinks</param>
        /// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
        /// <param name="crawlConfiguration">Configurable crawl values</param>
        public WebCrawler(
            CrawlConfiguration crawlConfiguration,
            ICrawlDecisionMaker crawlDecisionMaker,
            IThreadManager threadManager,
            IScheduler scheduler,
            IPageRequester httpRequester,
            IHyperLinkParser hyperLinkParser,
            IMemoryManager memoryManager)
        {
            _crawlContext = new CrawlContext();
            _crawlContext.CrawlConfiguration = crawlConfiguration ?? GetCrawlConfigurationFromConfigFile() ?? new CrawlConfiguration();
            CrawlBag = _crawlContext.CrawlBag;

            _threadManager      = threadManager ?? new ManualThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads);
            _scheduler          = scheduler ?? new FifoScheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled);
            _httpRequester      = httpRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
            _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

            if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0 ||
                _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
            {
                _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));
            }

            _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser();

            _crawlContext.Scheduler = _scheduler;
        }
Example #21
 public void ConfigureCrawler(int maxPagesToCrawl, int timeoutSeconds, int maxConcurrentThreads)
 {
     _crawlConfiguration = new CrawlConfiguration();
     _crawlConfiguration.CrawlTimeoutSeconds  = timeoutSeconds;
     _crawlConfiguration.MaxConcurrentThreads = maxConcurrentThreads;
     _crawlConfiguration.MaxPagesToCrawl      = maxPagesToCrawl;
 }
Example #22
        public void ShouldCrawlPage_OverMaxPagesToCrawlPerDomain_ReturnsFalse()
        {
            Uri uri = new Uri("http://a.com/");
            CrawlConfiguration config = new CrawlConfiguration
            {
                MaxPagesToCrawlPerDomain = 100
            };
            ConcurrentDictionary <string, int> countByDomain = new ConcurrentDictionary <string, int>();

            countByDomain.TryAdd(uri.Authority, 100);
            CrawlContext crawlContext = new CrawlContext
            {
                CrawlConfiguration = config,
                CrawlStartDate     = DateTime.Now,
                CrawlCountByDomain = countByDomain
            };

            CrawlDecision result = _unitUnderTest.ShouldCrawlPage(
                new PageToCrawl(new Uri(uri.AbsoluteUri + "anotherpage"))
            {
                IsInternal = true
            },
                crawlContext);

            Assert.IsFalse(result.Allow);
            Assert.AreEqual("MaxPagesToCrawlPerDomain limit of [100] has been reached for domain [a.com]", result.Reason);
            Assert.IsFalse(crawlContext.IsCrawlStopRequested);
        }
Example #23
        static void Main(string[] args)
        {
            CrawlConfiguration config = new CrawlConfiguration();

            config.MaxConcurrentThreads = 1; // Web Extractor is not currently thread-safe.

            // Create the PhantomJS instance. This will spawn a new PhantomJS process using phantomjs.exe.
            // Make sure to dispose this instance or you will have a zombie process!
            IWebDriver driver = CreatePhantomJsDriver(config);

            // Create the content extractor that uses PhantomJS.
            IWebContentExtractor extractor = new JavaScriptContentExtractor(driver);

            // Create a PageRequester that will use the extractor.
            IPageRequester requester = new PageRequester(config, extractor);

            using (IWebCrawler crawler = new PoliteWebCrawler(config, null, null, null, requester, null, null, null, null)) {
                crawler.PageCrawlCompleted += OnPageCrawlCompleted;

                CrawlResult result = crawler.Crawl(new Uri("http://wvtesting2.com/"));
                if (result.ErrorOccurred)
                {
                    Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
                }
                else
                {
                    Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
                }
            }

            Console.Read();
        }
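CreatePhantomJsDriver is not shown in this example. A hypothetical implementation based on Selenium's PhantomJS driver package (the service options are assumptions, and the config parameter is accepted only to mirror the call above):

        private static IWebDriver CreatePhantomJsDriver(CrawlConfiguration config)
        {
            // Spawns the phantomjs.exe process; dispose the returned driver to kill it.
            var service = PhantomJSDriverService.CreateDefaultService();
            service.HideCommandPromptWindow = true;
            return new PhantomJSDriver(service);
        }

Note that Main above never disposes the driver; wrapping it in a using block, or disposing it after the crawler finishes, avoids the zombie process the comment warns about.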
Example #24
        private static async Task DemoSimpleCrawler()
        {
            var config = new CrawlConfiguration
            {
                UserAgentString = "2019RLCrawlAThon",
                MaxPagesToCrawl = 0,
                MinCrawlDelayPerDomainMilliSeconds = 10,
            };
            var start   = new Uri("https://thailand.kyocera.com/");
            var crawler = new PoliteWebCrawler(
                config,
                new BetterDecisionMaker(start),
                null,
                new Scheduler(false, null, new PriorityUriRepository()),
                null,
                null,
                null,
                null,
                null);

            var files    = new HashSet <string>();
            var decMaker = new CrawlDecisionMaker();
            var batch    = new HashSet <string>();

            crawler.PageCrawlCompleted += Crawler_PageCrawlCompleted;
            crawler.PageCrawlCompleted += (sender, e) =>
            {
                if (new[] { ".exe", ".zip", ".tar" }.Any(c => e.CrawledPage.Uri.AbsolutePath.Contains(c)))
                {
                    lock (files)
                    {
                        Console.WriteLine("Found file: " + e.CrawledPage.Uri.Host + e.CrawledPage.Uri.LocalPath);
                        Console.WriteLine(e.CrawledPage.CrawlDepth);
                        if (!files.Contains(e.CrawledPage.Uri.ToString()))
                        {
                            files.Add(e.CrawledPage.Uri.ToString());
                            batch.Add(e.CrawledPage.Uri.ToString());
                            if (batch.Count >= 10)
                            {
                                using (var httpClient = new HttpClient())
                                {
                                    using (var request = new HttpRequestMessage(new HttpMethod("POST"), "http://hackathon.reversinglabs.com/api/test/bulk"))
                                    {
                                        var base64authorization = Convert.ToBase64String(Encoding.ASCII.GetBytes("tztok_jadnici:7@dQ6dqq7YZggcd"));
                                        request.Headers.TryAddWithoutValidation("Authorization", $"Basic {base64authorization}");

                                        var body = "{\"crawlathon\": {\"query\": {\"site\": \"filehippo\", \"links\":[" + string.Join(", ", batch.Select(s => "\"" + s + "\"")) + "]}}}";
                                        request.Content = new StringContent(body, Encoding.UTF8, "application/json");
                                        var resp = httpClient.SendAsync(request).Result;
                                        batch.Clear();
                                    }
                                }
                            }
                        }
                    }
                }
            };
            var crawlResult = await crawler.CrawlAsync(start);
        }
Example #25
        public void Test(Uri uri)
        {
            pageCount = 0;
            baseUri   = uri;
            string message;

            CrawlConfiguration crawlConfiguration = new CrawlConfiguration();

            crawlConfiguration.MaxConcurrentThreads = 4;
            crawlConfiguration.UserAgentString      =
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
                "AppleWebKit/537.36 (KHTML, like Gecko) " +
                "Chrome/60.0.3112.113 Safari/537.36 bot";
            crawlConfiguration.MaxPagesToCrawl          = 10000;
            crawlConfiguration.DownloadableContentTypes =
                "text/html, text/plain, image/jpeg, image/pjpeg, image/png";
            crawlConfiguration.CrawlTimeoutSeconds = 100;
            crawlConfiguration.MinCrawlDelayPerDomainMilliSeconds = 1000;

            using PoliteWebCrawler crawler =
                      new PoliteWebCrawler(crawlConfiguration);

            crawler.PageCrawlStarting  += ProcessPageCrawlStarted;
            crawler.PageCrawlCompleted += ProcessPageCrawlCompleted;

            CrawlResult result = crawler.CrawlAsync(baseUri).Result;

            if (result.ErrorOccurred)
            {
                message = StringTable.GetString(
                    "CRAWL_COMPLETE_ERROR",
                    CultureInfo.InstalledUICulture);

                Log.InfoFormat(
                    CultureInfo.InvariantCulture,
                    message,
                    result.RootUri.AbsoluteUri,
                    result.ErrorException.Message);
            }
            else
            {
                message = StringTable.GetString(
                    "CRAWL_COMPLETE_NO_ERROR",
                    CultureInfo.InstalledUICulture);

                Log.InfoFormat(
                    CultureInfo.InvariantCulture,
                    message,
                    result.RootUri.AbsoluteUri);
            }

            message = StringTable.GetString(
                "TOTAL_PAGES",
                CultureInfo.InstalledUICulture);
            Log.InfoFormat(
                CultureInfo.InvariantCulture,
                message,
                pageCount.ToString(CultureInfo.InvariantCulture));
        }
Example #26
        /// <summary>
        /// Decides whether the page's content should be downloaded
        /// </summary>
        /// <param name="crawledPage">Page for crawling</param>
        /// <param name="crawlContext">Collect all settings for crawl</param>
        /// <returns>Decision that should crawl or not</returns>
        public virtual CrawlDecision ShouldDownloadPageContent(CrawledPage crawledPage, CrawlContext crawlContext)
        {
            if (crawledPage == null)
            {
                return new CrawlDecision { Allow = false, Reason = "Null crawled page" };
            }

            if (crawlContext == null)
            {
                return new CrawlDecision { Allow = false, Reason = "Null crawl context" };
            }

            if (crawledPage.HttpWebResponse == null)
            {
                return new CrawlDecision { Allow = false, Reason = "Null HttpWebResponse" };
            }

            if (crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                return new CrawlDecision { Allow = false, Reason = $"Status code {crawledPage.HttpWebResponse.StatusCode}" };
            }

            if (!IsDownloadableByContentType(crawledPage, crawlContext, out List<string> cleanDownloadableContentTypes))
            {
                return new CrawlDecision { Allow = false, Reason = "Content type is not any of the following: " + string.Join(",", cleanDownloadableContentTypes) };
            }

            if (CrawlConfiguration.IsPayAttention(crawlContext.CrawlConfiguration.MaxPageSizeInBytes) &&
                crawledPage.HttpWebResponse.ContentLength > crawlContext.CrawlConfiguration.MaxPageSizeInBytes)
            {
                return new CrawlDecision
                {
                    Allow  = false,
                    Reason = $"Page size of [{crawledPage.HttpWebResponse.ContentLength}] bytes is above the max allowable of " +
                             $"[{crawlContext.CrawlConfiguration.MaxPageSizeInBytes}] bytes"
                };
            }

            return new CrawlDecision { Allow = true };
        }
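This method has the shape of a member of a custom crawl decision maker. A hypothetical sketch of plugging such a class into the crawler, assuming it implements Abot's ICrawlDecisionMaker and using the nine-parameter PoliteWebCrawler constructor seen in Example #24 (MyCrawlDecisionMaker is a made-up name for the class containing the override above):

            var config = new CrawlConfiguration { MaxPageSizeInBytes = 1024 * 1024 };
            ICrawlDecisionMaker decisionMaker = new MyCrawlDecisionMaker(); // hypothetical

            // The decision maker is the second constructor argument; nulls keep the defaults.
            var crawler = new PoliteWebCrawler(config, decisionMaker, null, null, null, null, null, null, null);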
Example #27
 private void Map(AuthorizationElement src, CrawlConfiguration dest)
 {
     dest.IsAlwaysLogin = src.IsAlwaysLogin;
     dest.LoginUser     = src.LoginUser;
     dest.LoginPassword = src.LoginPassword;
     dest.LoginDomain   = src.LoginDomain;
     dest.IsNTLM        = src.IsNTLM;
 }
Example #28
 public async Task Crawl_VerifyCrawlResultIsAsExpected()
 {
     var config = new CrawlConfiguration()
     {
         IsExternalPageCrawlingEnabled = true
     };
     await base.CrawlAndAssert(new PoliteWebCrawler(config));
 }
Example #29
 public PageRequester(ILogger <PageRequester> logger, IHttpClientFactory httpClientFactory, CrawlConfiguration crawlConfiguration, IWebContentExtractor webContentExtractor)
 {
     _logger              = logger;
     _httpFactory         = httpClientFactory;
     _client              = _httpFactory.CreateClient();
     _client.Timeout      = TimeSpan.FromMinutes(10);
     _config              = crawlConfiguration;
     _webContentExtractor = webContentExtractor;
 }
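A constructor like this is clearly meant to be satisfied by a dependency injection container. A hypothetical registration with Microsoft.Extensions.DependencyInjection, assuming this PageRequester implements Abot's IPageRequester and that WebContentExtractor implements IWebContentExtractor:

     var services = new ServiceCollection();

     services.AddLogging();                 // supplies ILogger<PageRequester>
     services.AddHttpClient();              // supplies IHttpClientFactory
     services.AddSingleton(new CrawlConfiguration { UserAgentString = "my crawler" });
     services.AddSingleton<IWebContentExtractor, WebContentExtractor>();
     services.AddSingleton<IPageRequester, PageRequester>();

     var requester = services.BuildServiceProvider().GetRequiredService<IPageRequester>();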
Example #30
 public void Dispose()
 {
     if (_extractor != null)
     {
         _extractor.Dispose();
     }
     _cookieContainer = null;
     _config          = null;
 }
Example #31
        public void ChangeMaxVisits_To10_IsChangedTo10()
        {
            // Arrange:
            var url = new Uri("http://www.uncas.dk");
            const int NewMaxVisits = 117;
            var crawlConfiguration =
                new CrawlConfiguration(
                    url,
                    NewMaxVisits);

            // Assert:
            Assert.AreEqual(
                NewMaxVisits,
                crawlConfiguration.MaxVisits);
        }
Example #32
        public void AddMatches_EmptyList_NoAdditional()
        {
            // Arrange:
            var url = new Uri("http://www.uncas.dk");
            var crawlConfiguration =
                new CrawlConfiguration(
                    url,
                    10);

            // Act:
            crawlConfiguration.AddMatches(null);

            // Assert:
            Assert.That(
                crawlConfiguration.MatchPatterns.Count(),
                Is.EqualTo(1));
        }
Example #33
        public void AddMatches_TwoItemsInList_TwoAdditional()
        {
            // Arrange:
            var url = new Uri("http://www.uncas.dk");
            var crawlConfiguration =
                new CrawlConfiguration(
                    url,
                    10);

            // Act:
            crawlConfiguration.AddMatches(new string[] { "x", "y" });

            // Assert:
            Assert.That(
                crawlConfiguration.MatchPatterns.Count(),
                Is.EqualTo(3));
        }
Example #34
        public void CrawlConfiguration_WithPatterns_ListIsPopulated()
        {
            // Arrange:
            var url = new Uri("http://www.uncas.dk");
            var patterns = new string[] { "x", "y" };

            // Act:
            var crawlConfiguration =
                new CrawlConfiguration(
                    url,
                    10,
                    patterns);

            // Assert:
            Assert.That(
                crawlConfiguration.MatchPatterns.Count(),
                Is.EqualTo(3));
        }
Example #35
        /// <summary>
        /// Parses the command line arguments.
        /// </summary>
        /// <param name="args">The command line arguments.</param>
        /// <returns>The crawl configuration.</returns>
        /// <remarks>
        /// Command line arguments, with default values:
        /// -url http://localhost -maxPages 10.
        /// </remarks>
        public static ICrawlConfiguration ParseArguments(
            IList<string> args)
        {
            string url = GetStartUrl(args);
            int? maxPages = GetMaxPages(args);

            var result = new CrawlConfiguration(
                new Uri(url),
                maxPages);
            string matches = CombinationParser.GetValue(
                args, "matches", "matches", string.Empty);
            if (!string.IsNullOrEmpty(matches))
            {
                string[] matchList =
                    matches.Split(
                    new char[] { ',' },
                    StringSplitOptions.RemoveEmptyEntries);
                result.AddMatches(matchList);
            }

            return result;
        }
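A hypothetical invocation of the parser above. The exact flag syntax depends on GetStartUrl, GetMaxPages and CombinationParser, which are not shown, so the argument strings below simply follow the remarks section ("-url", "-maxPages") plus the "matches" key read in the body:

            ICrawlConfiguration config = ParseArguments(new[]
            {
                "-url", "http://www.uncas.dk",
                "-maxPages", "25",
                "-matches", "blog,news"   // comma-separated patterns, split by the code above
            });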
Example #36
        public void ToString_WithMatchPatterns_Ok()
        {
            // Arrange:
            var url = new Uri("http://www.uncas.dk");
            var crawlConfiguration =
                new CrawlConfiguration(
                    url,
                    10);

            // Act:
            string result = crawlConfiguration.ToString();

            // Assert:
            Assert.That(
                result,
                Is.StringContaining("http://www.uncas.dk"));
        }
Example #37
        public void CrawlConfiguration_WithStartUrls_ListIsPopulated()
        {
            // Arrange:
            var url = new Uri("http://www.uncas.dk");
            var url2 = new Uri("http://www2.uncas.dk");
            var urls = new Uri[] { url, url2 };

            // Act:
            var crawlConfiguration =
                new CrawlConfiguration(
                    urls, 10);

            // Assert:
            Assert.That(
                crawlConfiguration.MatchPatterns.Count(),
                Is.EqualTo(2));
        }