static void Main(string[] args)
{
    // Load the seed URLs and the crawl rules that ship alongside the executable
    var urisToCrawl = GetSiteToCrawl(Path.Combine(System.AppDomain.CurrentDomain.BaseDirectory, @"FAQ\CrawlUrls.txt"));
    var crawlRuleContent = GetCrawlRuleFileContent(Path.Combine(System.AppDomain.CurrentDomain.BaseDirectory, @"FAQ\CrawlRules.txt"));
    var decisionMaker = new CrawlDecisionMakerWithCrawlRules(crawlRuleContent);

    XmlConfigurator.Configure(); // So the log4net logger is configured
    var config = AbotXConfigurationSectionHandler.LoadFromXml().Convert();
    config.IsJavascriptRenderingEnabled = true;
    config.JavascriptRenderingWaitTimeInMilliseconds = 3000;
    config.MaxConcurrentSiteCrawls = 1;
    config.MaxConcurrentThreads = 2;

    // Plug the custom decision maker into CrawlerX
    var impls = new ImplementationOverride(config);
    impls.CrawlDecisionMaker = decisionMaker;

    var crawler = new CrawlerX(config, impls);
    crawler.PageCrawlStarting += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompleted += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowed += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowed += crawler_PageLinksCrawlDisallowed;

    // Crawl each seed URL sequentially
    foreach (var uriToCrawl in urisToCrawl)
    {
        var result = crawler.Crawl(uriToCrawl);
    }

    Console.Read(); // Keep the console window open
}
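// Neither GetSiteToCrawl, GetCrawlRuleFileContent, nor the crawler_* event handlers are
// shown in the listing above. Below is a minimal sketch of what they might look like;
// the return types and the logging-only handler bodies are assumptions inferred from how
// Main uses them (requires System.IO, System.Linq and System.Net). The *Args types are
// Abot's standard event argument classes.
static IEnumerable<Uri> GetSiteToCrawl(string filePath)
{
    // Assumes one seed URL per line; blank lines are skipped.
    return File.ReadAllLines(filePath)
        .Where(line => !string.IsNullOrWhiteSpace(line))
        .Select(line => new Uri(line.Trim()));
}

static string GetCrawlRuleFileContent(string filePath)
{
    // The raw rule text is passed straight to CrawlDecisionMakerWithCrawlRules.
    return File.ReadAllText(filePath);
}

static void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
{
    Console.WriteLine("About to crawl link {0} found on page {1}", e.PageToCrawl.Uri.AbsoluteUri, e.PageToCrawl.ParentUri?.AbsoluteUri);
}

static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    var crawledPage = e.CrawledPage;
    if (crawledPage.WebException != null || crawledPage.HttpWebResponse == null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
        Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
    else
        Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);
}

static void crawler_PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e)
{
    Console.WriteLine("Did not crawl page {0} due to {1}", e.PageToCrawl.Uri.AbsoluteUri, e.DisallowedReason);
}

static void crawler_PageLinksCrawlDisallowed(object sender, PageLinksCrawlDisallowedArgs e)
{
    Console.WriteLine("Did not crawl the links on page {0} due to {1}", e.CrawledPage.Uri.AbsoluteUri, e.DisallowedReason);
}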
public void Run()
{
    var resources = SqlHelper.UnloadResources();
    if (!resources.Any())
    {
        return;
    }

    // Build 10 paged search URLs per resource (Baidu pages its results 10 at a time)
    var siteToCrawls = new List<SiteToCrawl>();
    foreach (var res in resources)
    {
        for (var i = 0; i < 10; i++)
        {
            siteToCrawls.Add(new SiteToCrawl
            {
                Uri = new Uri(string.Format(_urlPattern, res, 10 * i)),
                SiteBag = new { Name = res, Number = i + 1 }
            });
        }
    }

    CrawlConfigurationX config = AbotXConfigurationSectionHandler.LoadFromXml().Convert();
    XmlConfigurator.Configure(); // So the log4net logger is configured

    var siteToCrawlProvider = new SiteToCrawlProvider();
    siteToCrawlProvider.AddSitesToCrawl(siteToCrawls);

    // Create the crawl engine instance
    var impls = new ParallelImplementationOverride(
        config,
        new ParallelImplementationContainer
        {
            SiteToCrawlProvider = siteToCrawlProvider
        }
    );
    _crawlerEngine = new ParallelCrawlerEngine(config, impls);

    // Register for site-level events
    _crawlerEngine.AllCrawlsCompleted += (sender, eventArgs) =>
    {
        Console.WriteLine("Completed crawling all sites");
        _crawlerEngine.Stop(true);
        Run(); // Restart with the next batch of resources
    };
    _crawlerEngine.SiteCrawlCompleted += (sender, eventArgs) =>
    {
        Console.WriteLine("Completed crawling site {0}", eventArgs.CrawledSite.SiteToCrawl.Uri);
    };
    _crawlerEngine.CrawlerInstanceCreated += (sender, eventArgs) =>
    {
        eventArgs.Crawler.CrawlBag = eventArgs.SiteToCrawl.SiteBag;

        // Register for crawler-level events. These are Abot's events!
        eventArgs.Crawler.PageCrawlCompleted += (abotSender, abotEventArgs) =>
        {
            var crawlX = abotSender as CrawlerX;
            CrawledPage crawledPage = abotEventArgs.CrawledPage;
            if (crawledPage.WebException != null
                || crawledPage.HttpWebResponse == null // Guard against a null response as well
                || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
            }
            else if (string.IsNullOrEmpty(crawledPage.Content.Text))
            {
                Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
            }
            else
            {
                try
                {
                    // Only the depth-1 pages (the actual search results) are persisted
                    if (crawledPage.CrawlDepth == 1)
                    {
                        Console.WriteLine("Depth: {0} --- Crawl of page succeeded {1}", crawledPage.CrawlDepth, crawledPage.Uri.AbsoluteUri);
                        var item = new CrawledItem
                        {
                            Name = crawlX.CrawlBag.Name,
                            PageNumber = crawlX.CrawlBag.Number,
                            Url = crawledPage.Uri.AbsoluteUri,
                            Detail = crawledPage.Content.Text
                        };
                        SqlHelper.Store(new List<CrawledItem> { item });
                    }
                }
                catch (Exception e)
                {
                    Console.WriteLine(e.Message);
                }
            }
            //var htmlAgilityPackDocument = crawledPage.HtmlDocument; //Html Agility Pack parser
            //var angleSharpHtmlDocument = crawledPage.AngleSharpHtmlDocument; //AngleSharp parser
        };

        // Depth 0 must be a Baidu search-results page; depth 1 must be a result link
        eventArgs.Crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
        {
            var decision = new CrawlDecision { Allow = true };
            var isUnwantedAtDepth0 = pageToCrawl.CrawlDepth == 0 && !pageToCrawl.Uri.AbsoluteUri.Contains("www.baidu.com/s?wd");
            var isUnwantedAtDepth1 = pageToCrawl.CrawlDepth == 1 && !pageToCrawl.Uri.AbsoluteUri.Contains("www.baidu.com/link");
            if (isUnwantedAtDepth0 || isUnwantedAtDepth1)
            {
                return new CrawlDecision { Allow = false, Reason = "Don't want to crawl pages outside the Baidu search results" };
            }
            return decision;
        });
    };

    _crawlerEngine.StartAsync();
}
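// SqlHelper and CrawledItem are application-specific types that never appear in the
// listing. A minimal sketch of CrawledItem follows, assuming it is a plain DTO whose
// properties mirror the ones assigned in PageCrawlCompleted above; the persistence
// behind SqlHelper.Store/UnloadResources is left out because nothing in the listing
// reveals it.
public class CrawledItem
{
    public string Name { get; set; }
    public int PageNumber { get; set; }
    public string Url { get; set; }
    public string Detail { get; set; }
}
// Note that StartAsync() hands control back immediately, so the hosting process must be
// kept alive (e.g. with a Console.Read() as in the first listing) or the engine will be
// torn down before any crawls finish.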