/// <summary>
/// Console entry point. Loads the URL list and crawl rules from the FAQ folder
/// next to the executable, configures AbotX with JavaScript rendering enabled,
/// wires up the page-event handlers, then crawls each URL one at a time.
/// </summary>
static void Main(string[] args)
{
    // Both input files live in the FAQ folder alongside the executable;
    // compute the base directory once instead of repeating the expression.
    var baseDir = System.AppDomain.CurrentDomain.BaseDirectory;
    var urisToCrawl = GetSiteToCrawl(Path.Combine(baseDir, @"FAQ\CrawlUrls.txt"));
    var crawlRuleContent = GetCrawlRuleFileContent(Path.Combine(baseDir, @"FAQ\CrawlRules.txt"));
    var decisionMaker = new CrawlDecisionMakerWithCrawlRules(crawlRuleContent);

    // log4net and the AbotX section are both read from the app's .config file.
    XmlConfigurator.Configure();
    var config = AbotXConfigurationSectionHandler.LoadFromXml().Convert();
    config.IsJavascriptRenderingEnabled = true;
    config.JavascriptRenderingWaitTimeInMilliseconds = 3000;
    config.MaxConcurrentSiteCrawls = 1;
    config.MaxConcurrentThreads = 2;

    // Plug the rule-based decision maker into the crawler via an override.
    var impls = new ImplementationOverride(config)
    {
        CrawlDecisionMaker = decisionMaker
    };

    var crawler = new CrawlerX(config, impls);
    crawler.PageCrawlStarting += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompleted += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowed += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowed += crawler_PageLinksCrawlDisallowed;

    // Crawl each site serially. The crawl result was previously captured in an
    // unused local; all per-page work happens in the event handlers above, so
    // the return value is intentionally discarded here.
    foreach (var uriToCrawl in urisToCrawl)
    {
        crawler.Crawl(uriToCrawl);
    }

    // Keep the console window open until the user presses a key.
    Console.Read();
}
/* ========== Private Members ======== */

/* ======= Class Constructors ======== */
// ? public DataScraper() {}
// ? public DataCrawler(CrawlConfigurationX configX) {}

#region Public Class Methods
/* ================================= Class Methods {Public} ============================ */

/// <summary>
/// Static method for crawling. Pass in a configuration
/// (i.e. specify how many sites to crawl, whether or not to
/// render js, etc) then creates and executes crawler.
/// </summary>
/// <param name="configX">AbotX crawl configuration (concurrency, JS rendering, etc.).</param>
/// <param name="httpHandler">Handler (e.g. proxy settings) used by the page requester.</param>
/// <param name="pageHandlerType">Selects which PageCrawlCompleted handler is attached.</param>
/// <param name="uriToCrawl">Root URL to crawl; defaults to http://google.com.</param>
/// <returns>A task that completes when the crawl finishes.</returns>
public static async Task Crawl(CrawlConfigurationX configX, HttpClientHandler httpHandler, PageHandlerType pageHandlerType, string uriToCrawl = "http://google.com")
{
    // Route all page requests through the proxy-aware requester so the
    // supplied HttpClientHandler (proxy/credentials) is honored.
    var impContainer = new ImplementationContainer
    {
        PageRequester = new ProxyPageRequester(httpHandler, configX, new WebContentExtractor(), null)
    };
    var impOverride = new ImplementationOverride(configX, impContainer);

    // 'using' disposes the crawler (and any rendering resources it holds)
    // at the end of scope, matching the pattern in the AbotX examples.
    using (var crawlerX = new CrawlerX(configX, impOverride))
    {
        // Decide per page whether JavaScript should be rendered.
        // BUG FIX: the original return statement inside this 'if' was missing
        // its terminating semicolon ("... } } ;"), which does not compile.
        crawlerX.ShouldRenderPageJavascript((crawledPage, crawlContext) =>
        {
            if (crawledPage.Uri.AbsoluteUri.Contains("ghost"))
            {
                return new CrawlDecision { Allow = false, Reason = "scared to render ghost javascript." };
            }
            return new CrawlDecision { Allow = true };
        });

        // Attach the per-page completion handler requested by the caller.
        switch (pageHandlerType)
        {
            case PageHandlerType.wordFrequency:
                // Called when the crawl for each page is complete.
                crawlerX.PageCrawlCompleted += WordFrequencyHandler;
                break;
            case PageHandlerType.sentimentAnalysis:
                crawlerX.PageCrawlCompleted += SentimentAnalysisHandler;
                break;
        }

        await crawlerX.CrawlAsync(new Uri(uriToCrawl));
    }
}