static void DoCrawl()
{
    CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();
    crawlConfig.CrawlTimeoutSeconds = 100;
    crawlConfig.MaxConcurrentThreads = 10;
    crawlConfig.MaxPagesToCrawl = 5000;
    crawlConfig.UserAgentString = "abot v1.0 http://code.google.com/p/abot";
    //crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue1", "1111");
    //crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue2", "2222");

    //Will use app.config for configuration
    PoliteWebCrawler crawler = new PoliteWebCrawler();
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    CrawlResult result = crawler.Crawl(new Uri("http://sunnah.com/"));

    Console.WriteLine("Crawled page count: " + result.CrawlContext.CrawledCount);

    if (result.ErrorOccurred)
    {
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    }
    else
    {
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
    }
}
public WebsiteIndexer(string host, ICollection<string> ignoredPathes = null, int delayPerRequestMilliSeconds = 1000, int maxPagesToCrawl = 1000)
{
    _host = host;

    var config = new CrawlConfiguration
    {
        MaxPagesToCrawl = maxPagesToCrawl,
        MinCrawlDelayPerDomainMilliSeconds = delayPerRequestMilliSeconds,
        IsExternalPageCrawlingEnabled = false
    };

    Crawler = new PoliteWebCrawler(config)
    {
        ShouldCrawlPageDecisionMaker = (pageToCrawl, crawlContext) =>
        {
            var ignored = string.IsNullOrEmpty(pageToCrawl.Uri?.AbsolutePath) ||
                          ignoredPathes?.Any(p => Regex.IsMatch(pageToCrawl.Uri.AbsolutePath, p)) == true;

            if (ignored)
            {
                Console.WriteLine($"Ignored '{pageToCrawl.Uri?.AbsolutePath}'");
                return new CrawlDecision { Allow = false, Reason = "Path matches pattern in blacklist" };
            }

            return new CrawlDecision { Allow = true };
        }
    };

    Crawler.PageCrawlCompleted += PageCrawlCompleted;
}
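A minimal construction sketch for the WebsiteIndexer above. The host value and the regex blacklist entries are illustrative assumptions, not values from the original example.

// Hypothetical caller: skip /admin and any .pdf URL, throttle to one request
// every 2 seconds, and stop after 500 pages.
var indexer = new WebsiteIndexer(
    "https://example.com",
    ignoredPathes: new[] { "^/admin", "\\.pdf$" },
    delayPerRequestMilliSeconds: 2000,
    maxPagesToCrawl: 500);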
public void Crawl(CrawlRequest request)
{
    CrawlConfiguration crawlConfig = new CrawlConfiguration();
    crawlConfig.CrawlTimeoutSeconds = 100;
    crawlConfig.MaxConcurrentThreads = 10;
    crawlConfig.MaxPagesToCrawl = 1000;
    crawlConfig.UserAgentString = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; abot v1.0 http://code.google.com/p/abot)";
    crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue1", "1111");
    crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue2", "2222");
    crawlConfig.MaxCrawlDepth = 10;
    crawlConfig.DownloadableContentTypes = "text/html, text/plain";

    //Will use the manually created crawlConfig object above
    PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    CrawlResult result = crawler.Crawl(new Uri(request.EntryURL));

    if (result.ErrorOccurred)
    {
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    }
    else
    {
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
    }
}
public ProxyPageRequester(HttpClientHandler torHandler, CrawlConfiguration config, IWebContentExtractor contentExtractor = null, HttpClient httpClient = null)
    : base(config, contentExtractor, httpClient)
{
    _config = config;
    _contentExtractor = contentExtractor;
    _torHandler = torHandler;
}
private void Map(CrawlBehaviorElement src, CrawlConfiguration dest)
{
    dest.MaxConcurrentThreads = src.MaxConcurrentThreads;
    dest.MaxPagesToCrawl = src.MaxPagesToCrawl;
    dest.MaxPagesToCrawlPerDomain = src.MaxPagesToCrawlPerDomain;
    dest.MaxPageSizeInBytes = src.MaxPageSizeInBytes;
    dest.UserAgentString = src.UserAgentString;
    dest.HttpProtocolVersion = GetHttpProtocolVersion(src);
    dest.CrawlTimeoutSeconds = src.CrawlTimeoutSeconds;
    dest.IsUriRecrawlingEnabled = src.IsUriRecrawlingEnabled;
    dest.IsExternalPageCrawlingEnabled = src.IsExternalPageCrawlingEnabled;
    dest.IsExternalPageLinksCrawlingEnabled = src.IsExternalPageLinksCrawlingEnabled;
    dest.IsRespectUrlNamedAnchorOrHashbangEnabled = src.IsRespectUrlNamedAnchorOrHashbangEnabled;
    dest.DownloadableContentTypes = src.DownloadableContentTypes;
    dest.HttpServicePointConnectionLimit = src.HttpServicePointConnectionLimit;
    dest.HttpRequestTimeoutInSeconds = src.HttpRequestTimeoutInSeconds;
    dest.HttpRequestMaxAutoRedirects = src.HttpRequestMaxAutoRedirects;
    dest.IsHttpRequestAutoRedirectsEnabled = src.IsHttpRequestAutoRedirectsEnabled;
    dest.IsHttpRequestAutomaticDecompressionEnabled = src.IsHttpRequestAutomaticDecompressionEnabled;
    dest.IsSendingCookiesEnabled = src.IsSendingCookiesEnabled;
    dest.IsSslCertificateValidationEnabled = src.IsSslCertificateValidationEnabled;
    dest.MinAvailableMemoryRequiredInMb = src.MinAvailableMemoryRequiredInMb;
    dest.MaxMemoryUsageInMb = src.MaxMemoryUsageInMb;
    dest.MaxMemoryUsageCacheTimeInSeconds = src.MaxMemoryUsageCacheTimeInSeconds;
    dest.MaxCrawlDepth = src.MaxCrawlDepth;
    dest.MaxLinksPerPage = src.MaxLinksPerPage;
    dest.IsForcedLinkParsingEnabled = src.IsForcedLinkParsingEnabled;
    dest.MaxRetryCount = src.MaxRetryCount;
    dest.MinRetryDelayInMilliseconds = src.MinRetryDelayInMilliseconds;
}
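The mapping above delegates HttpProtocolVersion to a GetHttpProtocolVersion helper that is not part of this listing. A plausible sketch, assuming the config-section element exposes the version as a string and that the target enum has a NotSpecified member to fall back to:

private HttpProtocolVersion GetHttpProtocolVersion(CrawlBehaviorElement src)
{
    // Assumed mapping: parse the configured string (e.g. "NotSpecified", "Version10", "Version11")
    // into the enum, falling back to NotSpecified when the value is missing or unrecognized.
    if (Enum.TryParse(src.HttpProtocolVersion, ignoreCase: true, out HttpProtocolVersion version))
        return version;

    return HttpProtocolVersion.NotSpecified;
}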
public void Convert_CovertsFromSectionObjectToDtoObject()
{
    CrawlConfiguration result = _config.Convert();

    Assert.IsNotNull(result);
    Assert.AreEqual(result.CrawlTimeoutSeconds, _config.CrawlBehavior.CrawlTimeoutSeconds);
    Assert.AreEqual(result.DownloadableContentTypes, _config.CrawlBehavior.DownloadableContentTypes);
    Assert.AreEqual(result.IsUriRecrawlingEnabled, _config.CrawlBehavior.IsUriRecrawlingEnabled);
    Assert.AreEqual(result.MaxConcurrentThreads, _config.CrawlBehavior.MaxConcurrentThreads);
    Assert.AreEqual(result.MaxPagesToCrawl, _config.CrawlBehavior.MaxPagesToCrawl);
    Assert.AreEqual(result.MaxPagesToCrawlPerDomain, _config.CrawlBehavior.MaxPagesToCrawlPerDomain);
    Assert.AreEqual(result.MaxPageSizeInBytes, _config.CrawlBehavior.MaxPageSizeInBytes);
    Assert.AreEqual(result.UserAgentString, _config.CrawlBehavior.UserAgentString);
    Assert.AreEqual(result.IsExternalPageCrawlingEnabled, _config.CrawlBehavior.IsExternalPageCrawlingEnabled);
    Assert.AreEqual(result.IsExternalPageLinksCrawlingEnabled, _config.CrawlBehavior.IsExternalPageLinksCrawlingEnabled);
    Assert.AreEqual(result.HttpServicePointConnectionLimit, _config.CrawlBehavior.HttpServicePointConnectionLimit);
    Assert.AreEqual(result.HttpRequestTimeoutInSeconds, _config.CrawlBehavior.HttpRequestTimeoutInSeconds);
    Assert.AreEqual(result.HttpRequestMaxAutoRedirects, _config.CrawlBehavior.HttpRequestMaxAutoRedirects);
    Assert.AreEqual(true, _config.CrawlBehavior.IsHttpRequestAutoRedirectsEnabled);
    Assert.AreEqual(true, _config.CrawlBehavior.IsHttpRequestAutomaticDecompressionEnabled);
    Assert.AreEqual(result.MinAvailableMemoryRequiredInMb, _config.CrawlBehavior.MinAvailableMemoryRequiredInMb);
    Assert.AreEqual(result.MaxMemoryUsageInMb, _config.CrawlBehavior.MaxMemoryUsageInMb);
    Assert.AreEqual(result.MaxMemoryUsageCacheTimeInSeconds, _config.CrawlBehavior.MaxMemoryUsageCacheTimeInSeconds);
    Assert.AreEqual(result.MaxCrawlDepth, _config.CrawlBehavior.MaxCrawlDepth);
    Assert.AreEqual(result.IsForcedLinkParsingEnabled, _config.CrawlBehavior.IsForcedLinkParsingEnabled);
    Assert.AreEqual(result.IsRespectRobotsDotTextEnabled, _config.Politeness.IsRespectRobotsDotTextEnabled);
    Assert.AreEqual(result.RobotsDotTextUserAgentString, _config.Politeness.RobotsDotTextUserAgentString);
    Assert.AreEqual(result.MinCrawlDelayPerDomainMilliSeconds, _config.Politeness.MinCrawlDelayPerDomainMilliSeconds);
    Assert.AreEqual(result.MaxRobotsDotTextCrawlDelayInSeconds, _config.Politeness.MaxRobotsDotTextCrawlDelayInSeconds);

    Assert.IsNotNull(result.ConfigurationExtensions);
    Assert.AreEqual(result.ConfigurationExtensions["key1"], _config.ExtensionValues[0].Value);
    Assert.AreEqual(result.ConfigurationExtensions["key2"], _config.ExtensionValues[1].Value);
}
public void Crawl_IsRateLimited()
{
    new PageRequester(new CrawlConfiguration { UserAgentString = "aaa" }).MakeRequest(new Uri("http://localhost.fiddler:1111/PageGenerator/ClearCounters"));

    CrawlConfiguration configuration = new CrawlConfiguration();
    configuration.MaxPagesToCrawl = 3;
    configuration.MinCrawlDelayPerDomainMilliSeconds = 1000; // 1 second * 2 pages = 2 (or more) seconds

    int pagesCrawledCount = 0;

    var crawler = new PoliteWebCrawler(configuration);
    crawler.PageCrawlCompletedAsync += (a, b) => pagesCrawledCount++;

    var uriToCrawl = new Uri("http://localhost.fiddler:1111/");
    var start = DateTime.Now;
    crawler.Crawl(uriToCrawl);
    var elapsed = DateTime.Now - start;

    Assert.GreaterOrEqual(elapsed.TotalMilliseconds, 2000);
    Assert.AreEqual(3, pagesCrawledCount);
}
private void Map(AuthorizationElement src, CrawlConfiguration dest)
{
    dest.IsAlwaysLogin = src.IsAlwaysLogin;
    dest.LoginUser = src.LoginUser;
    dest.LoginPassword = src.LoginPassword;
    dest.UseDefaultCredentials = src.UseDefaultCredentials;
}
/// <summary>
/// Initializes the crawler from configuration and stores a definition of the instance
/// </summary>
/// <param name="seedUrl"></param>
/// <param name="sessionId"></param>
/// <param name="crawlerId"></param>
public bool InitializeCrawler(string seedUrl, int sessionId, int crawlerId)
{
    var config = new CrawlConfiguration();
    var abotSection = AbotConfigurationSectionHandler.LoadFromXml();

    if (abotSection != null)
    {
        config = abotSection.Convert();
        _logger.InfoFormat("CrawlConfiguration loaded from app.config");
    }
    else
    {
        config.CrawlTimeoutSeconds = 100;
        config.MaxConcurrentThreads = 1;
        config.MaxPagesToCrawl = long.MaxValue;
        config.IsExternalPageCrawlingEnabled = false;
        config.IsExternalPageLinksCrawlingEnabled = false;
        config.MinCrawlDelayPerDomainMilliSeconds = 10000;
        config.DownloadableContentTypes = "text/html, text/plain";
        config.IsHttpRequestAutoRedirectsEnabled = true;
        config.IsUriRecrawlingEnabled = false;
        config.UserAgentString = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0";
        _logger.InfoFormat("CrawlConfiguration default loaded");
    }

    return InitializeCrawler(seedUrl, sessionId, crawlerId, config);
}
/// <summary>
/// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor.
/// </summary>
/// <param name="crawlConfiguration">Configurable crawl values</param>
/// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
/// <param name="threadManager">Distributes http requests over multiple threads</param>
/// <param name="scheduler">Decides what link should be crawled next</param>
/// <param name="pageRequester">Makes the raw http requests</param>
/// <param name="htmlParser">Parses a crawled page for its hyperlinks</param>
/// <param name="memoryManager">Checks the memory usage of the host process</param>
public WebCrawler(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester pageRequester,
    IHtmlParser htmlParser,
    IMemoryManager memoryManager)
{
    _crawlContext = new CrawlContext
    {
        CrawlConfiguration = crawlConfiguration ?? new CrawlConfiguration()
    };
    CrawlBag = _crawlContext.CrawlBag;

    _threadManager = threadManager ?? new TaskThreadManager(
        _crawlContext.CrawlConfiguration.MaxConcurrentThreads > 0
            ? _crawlContext.CrawlConfiguration.MaxConcurrentThreads
            : Environment.ProcessorCount);

    _scheduler = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, null, null);
    _pageRequester = pageRequester ?? new PageRequester(_crawlContext.CrawlConfiguration, new WebContentExtractor());
    _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

    if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0 || _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
    {
        _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));
    }

    _htmlParser = htmlParser ?? new AngleSharpHyperlinkParser(_crawlContext.CrawlConfiguration, null);

    _crawlContext.Scheduler = _scheduler;
}
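A hedged example of calling this constructor, relying on the documented behavior that null arguments fall back to defaults and assuming WebCrawler is directly instantiable here; the MaxPagesToCrawl value is arbitrary.

// All dependencies left null, so the crawler builds its own defaults as described above.
var crawler = new WebCrawler(
    new CrawlConfiguration { MaxPagesToCrawl = 10 },
    null, null, null, null, null, null);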
public void Constructor_ValidUri_CreatesInstance()
{
    CrawlConfiguration unitUnderTest = new CrawlConfiguration();

    Assert.IsNotNull(unitUnderTest.ConfigurationExtensions);
    Assert.AreEqual(0, unitUnderTest.ConfigurationExtensions.Count);
    Assert.AreEqual(0, unitUnderTest.CrawlTimeoutSeconds);
    Assert.AreEqual("text/html", unitUnderTest.DownloadableContentTypes);
    Assert.AreEqual(false, unitUnderTest.IsExternalPageCrawlingEnabled);
    Assert.AreEqual(false, unitUnderTest.IsExternalPageLinksCrawlingEnabled);
    Assert.AreEqual(false, unitUnderTest.IsRespectRobotsDotTextEnabled);
    Assert.AreEqual(false, unitUnderTest.IsRespectMetaRobotsNoFollowEnabled);
    Assert.AreEqual(false, unitUnderTest.IsRespectAnchorRelNoFollowEnabled);
    Assert.AreEqual(false, unitUnderTest.IsUriRecrawlingEnabled);
    Assert.AreEqual(10, unitUnderTest.MaxConcurrentThreads);
    Assert.AreEqual(5, unitUnderTest.MaxRobotsDotTextCrawlDelayInSeconds);
    Assert.AreEqual(1000, unitUnderTest.MaxPagesToCrawl);
    Assert.AreEqual(0, unitUnderTest.MaxPagesToCrawlPerDomain);
    Assert.AreEqual(0, unitUnderTest.MinCrawlDelayPerDomainMilliSeconds);
    Assert.AreEqual("Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; abot v@ABOTASSEMBLYVERSION@ http://code.google.com/p/abot)", unitUnderTest.UserAgentString);
    Assert.AreEqual("abot", unitUnderTest.RobotsDotTextUserAgentString);
    Assert.AreEqual(0, unitUnderTest.MaxPageSizeInBytes);
    Assert.AreEqual(0, unitUnderTest.HttpServicePointConnectionLimit);
    Assert.AreEqual(0, unitUnderTest.HttpRequestTimeoutInSeconds);
    Assert.AreEqual(7, unitUnderTest.HttpRequestMaxAutoRedirects);
    Assert.AreEqual(true, unitUnderTest.IsHttpRequestAutoRedirectsEnabled);
    Assert.AreEqual(false, unitUnderTest.IsHttpRequestAutomaticDecompressionEnabled);
    Assert.AreEqual(0, unitUnderTest.MaxMemoryUsageCacheTimeInSeconds);
    Assert.AreEqual(0, unitUnderTest.MaxMemoryUsageInMb);
    Assert.AreEqual(0, unitUnderTest.MinAvailableMemoryRequiredInMb);
    Assert.AreEqual(100, unitUnderTest.MaxCrawlDepth);
    Assert.AreEqual(false, unitUnderTest.IsForcedLinkParsingEnabled);
}
/// <summary>
/// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor.
/// </summary>
/// <param name="crawlConfiguration">Configurable crawl values</param>
/// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
/// <param name="threadManager">Distributes http requests over multiple threads</param>
/// <param name="scheduler">Decides what link should be crawled next</param>
/// <param name="pageRequester">Makes the raw http requests</param>
/// <param name="hyperLinkParser">Parses a crawled page for its hyperlinks</param>
/// <param name="memoryManager">Checks the memory usage of the host process</param>
public WebCrawler(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester pageRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager)
{
    _crawlContext = new CrawlContext();
    _crawlContext.CrawlConfiguration = crawlConfiguration ?? GetCrawlConfigurationFromConfigFile();
    CrawlBag = _crawlContext.CrawlBag;

    _threadManager = threadManager ?? new TaskThreadManager(
        _crawlContext.CrawlConfiguration.MaxConcurrentThreads > 0
            ? _crawlContext.CrawlConfiguration.MaxConcurrentThreads
            : Environment.ProcessorCount);

    _scheduler = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, null, null);
    _pageRequester = pageRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
    _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

    if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0 || _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
    {
        _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));
    }

    _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser(_crawlContext.CrawlConfiguration.IsRespectMetaRobotsNoFollowEnabled, _crawlContext.CrawlConfiguration.IsRespectAnchorRelNoFollowEnabled);

    _crawlContext.Scheduler = _scheduler;
}
static void Main(string[] args)
{
    CrawlConfiguration crawlConfig = new CrawlConfiguration();
    crawlConfig.CrawlTimeoutSeconds = 100;
    crawlConfig.MaxConcurrentThreads = 1;
    crawlConfig.MaxPagesToCrawl = 1;

    PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig);
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    //crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    //crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    //This is synchronous, it will not go to the next line until the crawl has completed
    CrawlResult result = crawler.Crawl(new Uri("http://www.kmhk.kmu.edu.tw/news/list.asp?P_classify=9"));

    if (result.ErrorOccurred)
    {
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    }
    else
    {
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
    }
}
/// <summary>
/// Map Extension Value element to config
/// </summary>
/// <param name="destination"></param>
/// <param name="source"></param>
public static void ImportExtensionValueCollection(this CrawlConfiguration destination, ExtensionValueCollection source)
{
    foreach (ExtensionValueElement element in source)
    {
        destination.ConfigurationExtensions.Add(element.Key, element.Value);
    }
}
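A brief usage sketch for the extension method above, assuming the ExtensionValueCollection comes from the abot config section's ExtensionValues property, as in the conversion test earlier in this listing.

// Hypothetical caller: copy the <extensionValues> entries from the abot config section
// into a manually created CrawlConfiguration.
var section = AbotConfigurationSectionHandler.LoadFromXml();
var config = new CrawlConfiguration();
config.ImportExtensionValueCollection(section.ExtensionValues);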
private static IWebCrawler GetManuallyConfiguredWebCrawler()
{
    //Create a config object manually
    CrawlConfiguration config = new CrawlConfiguration();
    config.CrawlTimeoutSeconds = 0;
    config.DownloadableContentTypes = "text/html, text/plain";
    config.IsExternalPageCrawlingEnabled = false;
    config.IsExternalPageLinksCrawlingEnabled = false;
    config.IsRespectRobotsDotTextEnabled = false;
    config.IsUriRecrawlingEnabled = false;
    config.MaxConcurrentThreads = 100;
    config.MaxPagesToCrawl = 50;
    config.MaxPagesToCrawlPerDomain = 0;
    config.MinCrawlDelayPerDomainMilliSeconds = 1000;

    //Add your own values without modifying Abot's source code.
    //These are accessible in the CrawlContext.CrawlConfiguration.ConfigurationExtensions object throughout the crawl
    config.ConfigurationExtensions.Add("Somekey1", "SomeValue1");
    config.ConfigurationExtensions.Add("Somekey2", "SomeValue2");

    //Initialize the crawler with the custom configuration created above.
    //This overrides the app.config file values
    return new PoliteWebCrawler(config, null, null, null, null, null, null, null, null);
}
static WebCheckers()
{
    //Create a config object manually
    CrawlConfiguration config = new CrawlConfiguration();
    config.CrawlTimeoutSeconds = 0;
    config.DownloadableContentTypes = "text/html, text/plain";
    config.IsExternalPageCrawlingEnabled = false;
    config.IsExternalPageLinksCrawlingEnabled = false;
    config.IsRespectRobotsDotTextEnabled = false;
    config.IsUriRecrawlingEnabled = false;
    config.MaxConcurrentThreads = 1;
    config.MaxPagesToCrawl = 3000;
    config.MaxPagesToCrawlPerDomain = 0;
    config.MinCrawlDelayPerDomainMilliSeconds = 1000;
    config.HttpRequestTimeoutInSeconds = 60;

    //Add your own values without modifying Abot's source code.
    //These are accessible in the CrawlContext.CrawlConfiguration.ConfigurationExtensions object throughout the crawl
    config.ConfigurationExtensions.Add("KeywordExternalLink", "ExternalLink");
    config.ConfigurationExtensions.Add("KeywordID", "ID");
    config.ConfigurationExtensions.Add("BaseAddress", "http://officedevcentersite-devx.azurewebsites.net/");
    //config.ConfigurationExtensions.Add("IngoreUrlType", "htm");

    _config = GetCrawlConfigurationFromConfigFile() ?? config;
}
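The static constructor above only falls back to the hand-built config when GetCrawlConfigurationFromConfigFile() returns null. That helper is not shown here; a minimal sketch, assuming it simply wraps the app.config section handler used elsewhere in these examples:

private static CrawlConfiguration GetCrawlConfigurationFromConfigFile()
{
    // Returns null when no <abot> section is present, letting the caller fall back to its defaults.
    AbotConfigurationSectionHandler section = AbotConfigurationSectionHandler.LoadFromXml();
    return section?.Convert();
}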
public async Task Crawl_MaxPagesTo25_OnlyCrawls25Pages()
{
    await new PageRequester(new CrawlConfiguration { UserAgentString = "aaa" }).MakeRequestAsync(new Uri("http://localhost:1111/PageGenerator/ClearCounters"));

    CrawlConfiguration configuration = new CrawlConfiguration();
    configuration.MaxPagesToCrawl = 25;
    configuration.IsExternalPageCrawlingEnabled = true;
    configuration.IsExternalPageLinksCrawlingEnabled = true;

    int pagesCrawledCount = 0;

    PoliteWebCrawler crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);
    crawler.PageCrawlCompleted += (a, b) => { pagesCrawledCount++; };

    var res = await crawler.CrawlAsync(new Uri("http://localhost:1111/"));

    Assert.AreEqual(25, pagesCrawledCount);
}
public AbotExample1()
{
    var crawlConfig = new CrawlConfiguration();

    _crawler = new EasyWebCrawler(crawlConfig);
    _crawler.CrawlBag.MyFoo1 = new Foo();
    _crawler.CrawlBag.MyFoo2 = new Foo();

    _crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    _crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    _crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    _crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    //_crawler.ShouldCrawlPage((crawl, context) =>
    //{
    //    var decision = new CrawlDecision()
    //    {
    //        Allow = false,
    //    };
    //    return decision;
    //});

    _crawler.ShouldDownloadPageContent((page, context) =>
    {
        var link = page.Uri;
        Console.WriteLine(" --> detected link : {0}", link);
        return new CrawlDecision() { Allow = false };
    });
}
public void ShouldCrawlPage_OverMaxPagesToCrawlPerDomain_IsRetry_ReturnsTrue()
{
    Uri uri = new Uri("http://a.com/");
    CrawlConfiguration config = new CrawlConfiguration
    {
        MaxPagesToCrawlPerDomain = 100
    };
    ConcurrentDictionary<string, int> countByDomain = new ConcurrentDictionary<string, int>();
    countByDomain.TryAdd(uri.Authority, 100);
    CrawlContext crawlContext = new CrawlContext
    {
        CrawlConfiguration = config,
        CrawlStartDate = DateTime.Now,
        CrawlCountByDomain = countByDomain
    };

    CrawlDecision result = _unitUnderTest.ShouldCrawlPage(
        new PageToCrawl(new Uri(uri.AbsoluteUri + "anotherpage"))
        {
            IsRetry = true,
            IsInternal = true
        },
        crawlContext);

    Assert.IsTrue(result.Allow);
    Assert.IsFalse(result.ShouldHardStopCrawl);
    Assert.IsFalse(result.ShouldStopCrawl);
}
/// <summary>
/// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor.
/// </summary>
/// <param name="crawlConfiguration">Configurable crawl values</param>
/// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
/// <param name="threadManager">Distributes http requests over multiple threads</param>
/// <param name="scheduler">Decides what link should be crawled next</param>
/// <param name="httpRequester">Makes the raw http requests</param>
/// <param name="hyperLinkParser">Parses a crawled page for its hyperlinks</param>
/// <param name="memoryManager">Checks the memory usage of the host process</param>
public WebCrawler(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester httpRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager)
{
    _crawlContext = new CrawlContext();
    _crawlContext.CrawlConfiguration = crawlConfiguration ?? GetCrawlConfigurationFromConfigFile() ?? new CrawlConfiguration();
    CrawlBag = _crawlContext.CrawlBag;

    _threadManager = threadManager ?? new ManualThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads);
    _scheduler = scheduler ?? new FifoScheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled);
    _httpRequester = httpRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
    _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

    if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0 || _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
    {
        _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));
    }

    _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser();

    _crawlContext.Scheduler = _scheduler;
}
public void ConfigureCrawler(int maxPagesToCrawl, int timeoutSeconds, int maxConcurrentThreads)
{
    _crawlConfiguration = new CrawlConfiguration();
    _crawlConfiguration.CrawlTimeoutSeconds = timeoutSeconds;
    _crawlConfiguration.MaxConcurrentThreads = maxConcurrentThreads;
    _crawlConfiguration.MaxPagesToCrawl = maxPagesToCrawl;
}
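A short, hypothetical call of the helper above; the values are illustrative only.

// Crawl at most 100 pages, allow the whole crawl 60 seconds, and use 4 worker threads.
ConfigureCrawler(maxPagesToCrawl: 100, timeoutSeconds: 60, maxConcurrentThreads: 4);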
public void ShouldCrawlPage_OverMaxPagesToCrawlPerDomain_ReturnsFalse()
{
    Uri uri = new Uri("http://a.com/");
    CrawlConfiguration config = new CrawlConfiguration
    {
        MaxPagesToCrawlPerDomain = 100
    };
    ConcurrentDictionary<string, int> countByDomain = new ConcurrentDictionary<string, int>();
    countByDomain.TryAdd(uri.Authority, 100);
    CrawlContext crawlContext = new CrawlContext
    {
        CrawlConfiguration = config,
        CrawlStartDate = DateTime.Now,
        CrawlCountByDomain = countByDomain
    };

    CrawlDecision result = _unitUnderTest.ShouldCrawlPage(
        new PageToCrawl(new Uri(uri.AbsoluteUri + "anotherpage")) { IsInternal = true },
        crawlContext);

    Assert.IsFalse(result.Allow);
    Assert.AreEqual("MaxPagesToCrawlPerDomain limit of [100] has been reached for domain [a.com]", result.Reason);
    Assert.IsFalse(crawlContext.IsCrawlStopRequested);
}
static void Main(string[] args)
{
    CrawlConfiguration config = new CrawlConfiguration();
    config.MaxConcurrentThreads = 1; // Web Extractor is not currently thread-safe.

    // Create the PhantomJS instance. This will spawn a new PhantomJS process using phantomjs.exe.
    // Make sure to dispose this instance or you will have a zombie process!
    IWebDriver driver = CreatePhantomJsDriver(config);

    // Create the content extractor that uses PhantomJS.
    IWebContentExtractor extractor = new JavaScriptContentExtractor(driver);

    // Create a PageRequester that will use the extractor.
    IPageRequester requester = new PageRequester(config, extractor);

    using (IWebCrawler crawler = new PoliteWebCrawler(config, null, null, null, requester, null, null, null, null))
    {
        crawler.PageCrawlCompleted += OnPageCrawlCompleted;

        CrawlResult result = crawler.Crawl(new Uri("http://wvtesting2.com/"));
        if (result.ErrorOccurred)
        {
            Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
        }
        else
        {
            Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
        }
    }

    Console.Read();
}
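CreatePhantomJsDriver is referenced above but not shown. A minimal sketch, assuming Selenium's PhantomJS bindings are the driver in use; whether and how the crawl configuration feeds into the driver is left open here, so the unused parameter is only kept to mirror the call site.

private static IWebDriver CreatePhantomJsDriver(CrawlConfiguration config)
{
    // Spawns a phantomjs.exe process; callers must dispose the driver to avoid orphan processes.
    // The config parameter mirrors the call site above but is not used in this sketch.
    return new PhantomJSDriver();
}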
private static async Task DemoSimpleCrawler()
{
    var config = new CrawlConfiguration
    {
        UserAgentString = "2019RLCrawlAThon",
        MaxPagesToCrawl = 0,
        MinCrawlDelayPerDomainMilliSeconds = 10,
    };
    var start = new Uri("https://thailand.kyocera.com/");
    var crawler = new PoliteWebCrawler(
        config,
        new BetterDecisionMaker(start),
        null,
        new Scheduler(false, null, new PriorityUriRepository()),
        null, null, null, null, null);

    var files = new HashSet<string>();
    var decMaker = new CrawlDecisionMaker();
    var batch = new HashSet<string>();

    crawler.PageCrawlCompleted += Crawler_PageCrawlCompleted;
    crawler.PageCrawlCompleted += (sender, e) =>
    {
        if (new[] { ".exe", ".zip", ".tar" }.Any(c => e.CrawledPage.Uri.AbsolutePath.Contains(c)))
        {
            lock (files)
            {
                Console.WriteLine("Found file: " + e.CrawledPage.Uri.Host + e.CrawledPage.Uri.LocalPath);
                Console.WriteLine(e.CrawledPage.CrawlDepth);

                if (!files.Contains(e.CrawledPage.Uri.ToString()))
                {
                    files.Add(e.CrawledPage.Uri.ToString());
                    batch.Add(e.CrawledPage.Uri.ToString());

                    if (batch.Count >= 10)
                    {
                        using (var httpClient = new HttpClient())
                        {
                            using (var request = new HttpRequestMessage(new HttpMethod("POST"), "http://hackathon.reversinglabs.com/api/test/bulk"))
                            {
                                var base64authorization = Convert.ToBase64String(Encoding.ASCII.GetBytes("tztok_jadnici:7@dQ6dqq7YZggcd"));
                                request.Headers.TryAddWithoutValidation("Authorization", $"Basic {base64authorization}");

                                var body = "{\"crawlathon\": {\"query\": {\"site\": \"filehippo\", \"links\":[" + string.Join(", ", batch.Select(s => "\"" + s + "\"")) + "]}}}";
                                request.Content = new StringContent(body, Encoding.UTF8, "application/json");

                                var resp = httpClient.SendAsync(request).Result;
                                batch.Clear();
                            }
                        }
                    }
                }
            }
        }
    };

    var crawlResult = await crawler.CrawlAsync(start);
}
public void Test(Uri uri)
{
    pageCount = 0;
    baseUri = uri;
    string message;

    CrawlConfiguration crawlConfiguration = new CrawlConfiguration();
    crawlConfiguration.MaxConcurrentThreads = 4;
    crawlConfiguration.UserAgentString =
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
        "AppleWebKit/537.36 (KHTML, like Gecko) " +
        "Chrome/60.0.3112.113 Safari/537.36 bot";
    crawlConfiguration.MaxPagesToCrawl = 10000;
    crawlConfiguration.DownloadableContentTypes = "text/html, text/plain, image/jpeg, image/pjpeg, image/png";
    crawlConfiguration.CrawlTimeoutSeconds = 100;
    crawlConfiguration.MinCrawlDelayPerDomainMilliSeconds = 1000;

    using PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfiguration);

    crawler.PageCrawlStarting += ProcessPageCrawlStarted;
    crawler.PageCrawlCompleted += ProcessPageCrawlCompleted;

    CrawlResult result = crawler.CrawlAsync(baseUri).Result;

    if (result.ErrorOccurred)
    {
        message = StringTable.GetString("CRAWL_COMPLETE_ERROR", CultureInfo.InstalledUICulture);
        Log.InfoFormat(CultureInfo.InvariantCulture, message, result.RootUri.AbsoluteUri, result.ErrorException.Message);
    }
    else
    {
        message = StringTable.GetString("CRAWL_COMPLETE_NO_ERROR", CultureInfo.InstalledUICulture);
        Log.InfoFormat(CultureInfo.InvariantCulture, message, result.RootUri.AbsoluteUri);
    }

    message = StringTable.GetString("TOTAL_PAGES", CultureInfo.InstalledUICulture);
    Log.InfoFormat(CultureInfo.InvariantCulture, message, pageCount.ToString(CultureInfo.InvariantCulture));
}
/// <summary>
/// Decides whether the page's content should be downloaded
/// </summary>
/// <param name="crawledPage">Page that was crawled</param>
/// <param name="crawlContext">Context holding all settings for the crawl</param>
/// <returns>Decision on whether the content should be downloaded</returns>
public virtual CrawlDecision ShouldDownloadPageContent(CrawledPage crawledPage, CrawlContext crawlContext)
{
    if (crawledPage == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawled page" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    if (crawledPage.HttpWebResponse == null)
        return new CrawlDecision { Allow = false, Reason = "Null HttpWebResponse" };

    if (crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
        return new CrawlDecision { Allow = false, Reason = $"Status code {crawledPage.HttpWebResponse.StatusCode}" };

    if (!IsDownloadableByContentType(crawledPage, crawlContext, out List<string> cleanDownloadableContentTypes))
        return new CrawlDecision { Allow = false, Reason = "Content type is not any of the following: " + string.Join(",", cleanDownloadableContentTypes) };

    if (CrawlConfiguration.IsPayAttention(crawlContext.CrawlConfiguration.MaxPageSizeInBytes) &&
        crawledPage.HttpWebResponse.ContentLength > crawlContext.CrawlConfiguration.MaxPageSizeInBytes)
    {
        return new CrawlDecision
        {
            Allow = false,
            Reason = $"Page size of [{crawledPage.HttpWebResponse.ContentLength}] bytes is above the max allowable of " +
                     $"[{crawlContext.CrawlConfiguration.MaxPageSizeInBytes}] bytes"
        };
    }

    return new CrawlDecision { Allow = true };
}
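The size check above calls a CrawlConfiguration.IsPayAttention helper that is not part of this listing. Presumably it treats zero as "no limit configured"; a minimal sketch under that assumption (the static placement on CrawlConfiguration and the long parameter type follow the call site):

public static bool IsPayAttention(long configuredValue)
{
    // Assumed semantics: a configured limit only applies when it is greater than zero.
    return configuredValue > 0;
}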
private void Map(AuthorizationElement src, CrawlConfiguration dest)
{
    dest.IsAlwaysLogin = src.IsAlwaysLogin;
    dest.LoginUser = src.LoginUser;
    dest.LoginPassword = src.LoginPassword;
    dest.LoginDomain = src.LoginDomain;
    dest.IsNTLM = src.IsNTLM;
}
public async Task Crawl_VerifyCrawlResultIsAsExpected()
{
    var config = new CrawlConfiguration()
    {
        IsExternalPageCrawlingEnabled = true
    };

    await base.CrawlAndAssert(new PoliteWebCrawler(config));
}
public PageRequester(ILogger<PageRequester> logger, IHttpClientFactory httpClientFactory, CrawlConfiguration crawlConfiguration, IWebContentExtractor webContentExtractor)
{
    _logger = logger;
    _httpFactory = httpClientFactory;
    _client = _httpFactory.CreateClient();
    _client.Timeout = TimeSpan.FromMinutes(10);
    _config = crawlConfiguration;
    _webContentExtractor = webContentExtractor;
}
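This constructor is written for dependency injection. A hedged registration sketch using Microsoft.Extensions.DependencyInjection; the concrete WebContentExtractor binding mirrors the default used in the WebCrawler constructor example earlier in this listing, and the MaxPagesToCrawl value is arbitrary.

using Microsoft.Extensions.DependencyInjection;

var services = new ServiceCollection();
services.AddLogging();                                      // provides ILogger<PageRequester>
services.AddHttpClient();                                   // provides IHttpClientFactory
services.AddSingleton(new CrawlConfiguration { MaxPagesToCrawl = 100 });
services.AddSingleton<IWebContentExtractor, WebContentExtractor>();
services.AddTransient<PageRequester>();

var requester = services.BuildServiceProvider().GetRequiredService<PageRequester>();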
public void Dispose()
{
    if (_extractor != null)
    {
        _extractor.Dispose();
    }

    _cookieContainer = null;
    _config = null;
}
public void ChangeMaxVisits_To10_IsChangedTo10()
{
    // Arrange:
    var url = new Uri("http://www.uncas.dk");
    const int NewMaxVisits = 117;
    var crawlConfiguration = new CrawlConfiguration(url, NewMaxVisits);

    // Assert:
    Assert.AreEqual(NewMaxVisits, crawlConfiguration.MaxVisits);
}
public void AddMatches_EmptyList_NoAdditional()
{
    // Arrange:
    var url = new Uri("http://www.uncas.dk");
    var crawlConfiguration = new CrawlConfiguration(url, 10);

    // Act:
    crawlConfiguration.AddMatches(null);

    // Assert:
    Assert.That(crawlConfiguration.MatchPatterns.Count(), Is.EqualTo(1));
}
public void AddMatches_TwoItemsInList_TwoAdditional()
{
    // Arrange:
    var url = new Uri("http://www.uncas.dk");
    var crawlConfiguration = new CrawlConfiguration(url, 10);

    // Act:
    crawlConfiguration.AddMatches(new string[] { "x", "y" });

    // Assert:
    Assert.That(crawlConfiguration.MatchPatterns.Count(), Is.EqualTo(3));
}
public void CrawlConfiguration_WithPatterns_ListIsPopulated()
{
    // Arrange:
    var url = new Uri("http://www.uncas.dk");
    var patterns = new string[] { "x", "y" };

    // Act:
    var crawlConfiguration = new CrawlConfiguration(url, 10, patterns);

    // Assert:
    Assert.That(crawlConfiguration.MatchPatterns.Count(), Is.EqualTo(3));
}
/// <summary>
/// Parses the command line arguments.
/// </summary>
/// <param name="args">The command line arguments.</param>
/// <returns>The crawl configuration.</returns>
/// <remarks>
/// Command line arguments, with default values:
/// -url http://localhost -maxPages 10.
/// </remarks>
public static ICrawlConfiguration ParseArguments(IList<string> args)
{
    string url = GetStartUrl(args);
    int? maxPages = GetMaxPages(args);
    var result = new CrawlConfiguration(new Uri(url), maxPages);

    string matches = CombinationParser.GetValue(args, "matches", "matches", string.Empty);
    if (!string.IsNullOrEmpty(matches))
    {
        string[] matchList = matches.Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
        result.AddMatches(matchList);
    }

    return result;
}
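A hedged example of calling ParseArguments with the argument style described in the remarks; the exact flag spellings accepted by GetStartUrl, GetMaxPages, and CombinationParser are not shown in this listing, so treat them as assumptions.

// Hypothetical invocation: start at uncas.dk, visit at most 25 pages,
// and only follow URLs matching "blog" or "docs".
var args = new List<string> { "-url", "http://www.uncas.dk", "-maxPages", "25", "-matches", "blog,docs" };
ICrawlConfiguration configuration = ParseArguments(args);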
public void ToString_WithMatchPatterns_Ok()
{
    // Arrange:
    var url = new Uri("http://www.uncas.dk");
    var crawlConfiguration = new CrawlConfiguration(url, 10);

    // Act:
    string result = crawlConfiguration.ToString();

    // Assert:
    Assert.That(result, Is.StringContaining("http://www.uncas.dk"));
}
public void CrawlConfiguration_WithStartUrls_ListIsPopulated()
{
    // Arrange:
    var url = new Uri("http://www.uncas.dk");
    var url2 = new Uri("http://www2.uncas.dk");
    var urls = new Uri[] { url, url2 };

    // Act:
    var crawlConfiguration = new CrawlConfiguration(urls, 10);

    // Assert:
    Assert.That(crawlConfiguration.MatchPatterns.Count(), Is.EqualTo(2));
}