public HomeController()
{
    _crawler = new CrawlerX();
    _crawler.PageCrawlCompletedAsync += SaveWebsiteUriRequestResult;
    _crawler.PageCrawlDisallowedAsync += SaveWebsiteUriRequestResult;
    _crawler.PageLinksCrawlDisallowedAsync += SaveWebsiteUriRequestResult;
}
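// SaveWebsiteUriRequestResult is not shown in this snippet. Since the three events
// above use different event-arg types, the handler is presumably a set of overloads.
// A minimal sketch, assuming Abot's standard event args (hypothetical; the original
// implementation may persist results rather than log them):
private void SaveWebsiteUriRequestResult(object sender, PageCrawlCompletedArgs e)
{
    // Fired after a page request finishes (successfully or not)
    Debug.WriteLine($"Completed: {e.CrawledPage.Uri}");
}

private void SaveWebsiteUriRequestResult(object sender, PageCrawlDisallowedArgs e)
{
    // Fired when a page is skipped; PageCrawlDisallowedArgs exposes PageToCrawl
    Debug.WriteLine($"Disallowed: {e.PageToCrawl.Uri} ({e.DisallowedReason})");
}

private void SaveWebsiteUriRequestResult(object sender, PageLinksCrawlDisallowedArgs e)
{
    // Fired when a page was crawled but its links were not followed
    Debug.WriteLine($"Links disallowed: {e.CrawledPage.Uri} ({e.DisallowedReason})");
}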
static void Main(string[] args)
{
    var urisToCrawl = GetSiteToCrawl(
        Path.Combine(System.AppDomain.CurrentDomain.BaseDirectory, @"FAQ\CrawlUrls.txt"));
    var crawlRuleContent = GetCrawlRuleFileContent(
        Path.Combine(System.AppDomain.CurrentDomain.BaseDirectory, @"FAQ\CrawlRules.txt"));
    var decisionMaker = new CrawlDecisionMakerWithCrawlRules(crawlRuleContent);

    XmlConfigurator.Configure();
    var config = AbotXConfigurationSectionHandler.LoadFromXml().Convert();
    config.IsJavascriptRenderingEnabled = true;
    config.JavascriptRenderingWaitTimeInMilliseconds = 3000;
    config.MaxConcurrentSiteCrawls = 1;
    config.MaxConcurrentThreads = 2;

    var impls = new ImplementationOverride(config);
    impls.CrawlDecisionMaker = decisionMaker;

    var crawler = new CrawlerX(config, impls);
    crawler.PageCrawlStarting += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompleted += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowed += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowed += crawler_PageLinksCrawlDisallowed;

    foreach (var uriToCrawl in urisToCrawl)
    {
        var result = crawler.Crawl(uriToCrawl);
    }

    Console.Read();
}
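// GetSiteToCrawl and GetCrawlRuleFileContent are helpers defined elsewhere in this
// project. A minimal sketch of what they might look like, assuming CrawlUrls.txt
// holds one URL per line (hypothetical; requires System.IO and System.Linq):
static IEnumerable<Uri> GetSiteToCrawl(string filePath)
{
    return File.ReadAllLines(filePath)
        .Where(line => !string.IsNullOrWhiteSpace(line))
        .Select(line => new Uri(line.Trim()));
}

static string GetCrawlRuleFileContent(string filePath)
{
    return File.ReadAllText(filePath);
}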
/* ========== Private Members ======== */

/* ======= Class Constructors ======== */
// ? public DataScraper() {}
// ? public DataCrawler(CrawlConfigurationX configX) {}

#region Public Class Methods
/* ================================= Class Methods {Public} ============================ */

/// <summary>
/// Static method for crawling. Pass in a configuration
/// (e.g. specify how many sites to crawl, whether or not to
/// render js, etc.), then creates and executes the crawler.
/// </summary>
public static async Task Crawl(CrawlConfigurationX configX, HttpClientHandler httpHandler, PageHandlerType pageHandlerType, string uriToCrawl = "http://google.com")
{
    // 'using' scopes the crawlerX object and disposes of it at the end of the
    // scope (i.e. the close-curly-brace). This pattern follows the GitHub example.
    var impContainer = new ImplementationContainer();
    impContainer.PageRequester = new ProxyPageRequester(httpHandler, configX, new WebContentExtractor(), null);
    var impOverride = new ImplementationOverride(configX, impContainer);

    using (var crawlerX = new CrawlerX(configX, impOverride))
    {
        crawlerX.ShouldRenderPageJavascript((crawledPage, crawlContext) =>
        {
            if (crawledPage.Uri.AbsoluteUri.Contains("ghost"))
            {
                return new CrawlDecision { Allow = false, Reason = "Scared to render ghost javascript." };
            }

            return new CrawlDecision { Allow = true };
        });

        switch (pageHandlerType)
        {
            case PageHandlerType.wordFrequency:
                // Add handler to be called when the crawl for that page is complete
                crawlerX.PageCrawlCompleted += WordFrequencyHandler;
                break;
            case PageHandlerType.sentimentAnalysis:
                crawlerX.PageCrawlCompleted += SentimentAnalysisHandler;
                break;
        }

        await crawlerX.CrawlAsync(new Uri(uriToCrawl));
    }
}
#endregion
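// WordFrequencyHandler (wired above) is not included in this excerpt. A minimal
// sketch of a PageCrawlCompleted handler that tallies word counts, assuming the
// page body is available via e.CrawledPage.Content.Text (hypothetical; requires
// System.Linq and System.Text.RegularExpressions):
private static void WordFrequencyHandler(object sender, PageCrawlCompletedArgs e)
{
    var text = e.CrawledPage.Content?.Text;
    if (string.IsNullOrEmpty(text))
        return;

    // Count occurrences of each word, case-insensitively
    var frequencies = Regex.Matches(text, @"\b\w+\b")
        .Cast<Match>()
        .GroupBy(m => m.Value.ToLowerInvariant())
        .ToDictionary(g => g.Key, g => g.Count());

    Console.WriteLine($"{e.CrawledPage.Uri}: {frequencies.Count} distinct words");
}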
public IHttpActionResult Get(string url)
{
    baseUrl = url = url.StartsWith("http") ? url : $"http://{url}";
    var crawler = new CrawlerX();
    var uri = new Uri(url);
    crawler.PageCrawlCompleted += crawler_ProcessPageCrawlCompleted;
    var result = crawler.Crawl(uri);
    if (result.ErrorOccurred || exceptionCounter > 0)
    {
        throw new Exception($"Error occurred while saving images from {url}");
    }

    return Json(imageUrls);
}
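// crawler_ProcessPageCrawlCompleted, imageUrls, and exceptionCounter are defined
// elsewhere in this controller. A minimal sketch that collects <img> sources into
// imageUrls via the HtmlAgilityPack document on CrawledPage (hypothetical; the
// original handler may filter or download the images):
private void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    if (e.CrawledPage.WebException != null)
    {
        exceptionCounter++;
        return;
    }

    var imgNodes = e.CrawledPage.HtmlDocument?.DocumentNode.SelectNodes("//img[@src]");
    if (imgNodes == null)
        return;

    foreach (var img in imgNodes)
    {
        // Resolve relative src values against the base URL before recording them
        imageUrls.Add(new Uri(new Uri(baseUrl), img.GetAttributeValue("src", "")).AbsoluteUri);
    }
}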
public void Start()
{
    if (!init)
    {
        throw new AgetNotInitializedException();
    }

    lock (this)
    {
        log.Debug($"Starting crawler with id {guid}");
        if (isRunning)
        {
            log.Info($"Crawler with id {guid} is already started");
            return;
        }

        log.Debug("Initializing CrawlerX");
        isRunning = true;
        agent = new CrawlerX();
        agent.PageCrawlCompleted += Agent_PageCrawlCompleted;
        agent.PageLinksCrawlDisallowed += Agent_PageLinksCrawlDisallowed;
        //agent.ShouldCrawlPage(ShouldCrawlPage);

        new Thread(() =>
        {
            log.Debug("Trying to start CrawlerX");
            using (var dbContext = new ApplicationDbContext())
            {
                var site = dbContext.Sites.FirstOrDefault(m => m.Id == siteId);
                agent.Crawl(new Uri(site.BaseUrl));
                log.Info("Crawling is done");
                lock (this) isRunning = false;
                log.Debug("Calling manager");
                manager.Done(this);
            }
        }).Start();
    }
}
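// Agent_PageCrawlCompleted is defined elsewhere in this class. A minimal sketch,
// assuming a log4net-style logger as used above (hypothetical; the real handler
// may persist crawl results through ApplicationDbContext instead):
private void Agent_PageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    if (e.CrawledPage.WebException != null)
    {
        log.Warn($"Failed to crawl {e.CrawledPage.Uri}: {e.CrawledPage.WebException.Message}");
        return;
    }

    log.Debug($"Crawled {e.CrawledPage.Uri}");
}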