/// <summary>
/// Console entry point: builds a JavaScript-rendering <c>CrawlerX</c> driven by the
/// rule file and crawls every URI listed in the URL file, one after another.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // Both input files live in the "FAQ" folder under the application base directory.
    var baseDirectory = System.AppDomain.CurrentDomain.BaseDirectory;

    var urisToCrawl = GetSiteToCrawl(Path.Combine(baseDirectory, @"FAQ\CrawlUrls.txt"));
    var crawlRuleContent = GetCrawlRuleFileContent(Path.Combine(baseDirectory, @"FAQ\CrawlRules.txt"));
    var decisionMaker = new CrawlDecisionMakerWithCrawlRules(crawlRuleContent);

    // Presumably the log4net XML configurator; it runs before the crawler configuration is loaded.
    XmlConfigurator.Configure();

    // Start from the XML configuration section, then override the rendering and concurrency settings.
    var config = AbotXConfigurationSectionHandler.LoadFromXml().Convert();
    config.IsJavascriptRenderingEnabled = true;
    config.JavascriptRenderingWaitTimeInMilliseconds = 3000;
    config.MaxConcurrentSiteCrawls = 1;
    config.MaxConcurrentThreads = 2;

    // Plug the rule-based decision maker into the crawler.
    var impls = new ImplementationOverride(config)
    {
        CrawlDecisionMaker = decisionMaker
    };

    var crawler = new CrawlerX(config, impls);
    crawler.PageCrawlStarting += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompleted += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowed += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowed += crawler_PageLinksCrawlDisallowed;

    // Crawl each target in turn; the crawl result itself is not inspected here.
    foreach (var target in urisToCrawl)
    {
        crawler.Crawl(target);
    }

    // Wait for console input before the process exits.
    Console.Read();
}
/// <summary>
/// Crawls the given site and returns the collected image URLs as JSON.
/// </summary>
/// <param name="url">
/// Address of the site to crawl. When it does not already start with
/// <c>http://</c> or <c>https://</c> (case-insensitive), <c>http://</c> is prepended.
/// </param>
/// <returns>A JSON result wrapping <c>imageUrls</c>.</returns>
/// <exception cref="ArgumentException"><paramref name="url"/> is null, empty or whitespace.</exception>
/// <exception cref="InvalidOperationException">
/// The crawl reported an error, or <c>exceptionCounter</c> is greater than zero.
/// </exception>
public IHttpActionResult Get(string url)
{
    if (string.IsNullOrWhiteSpace(url))
    {
        throw new ArgumentException("A URL to crawl must be provided.", nameof(url));
    }

    // Test for a real scheme prefix with an ordinal comparison. A bare "http" prefix
    // would leave hosts such as "httpbin.org" without a scheme, and a case-sensitive
    // check would double-prefix "HTTP://...".
    var hasHttpScheme =
        url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
        url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
    if (!hasHttpScheme)
    {
        url = $"http://{url}";
    }

    baseUrl = url;

    var crawler = new CrawlerX();
    var uri = new Uri(url);
    crawler.PageCrawlCompleted += crawler_ProcessPageCrawlCompleted;

    var result = crawler.Crawl(uri);

    // NOTE(review): exceptionCounter and imageUrls are declared outside this method;
    // if they outlive a single request they are never reset here - confirm.
    if (result.ErrorOccurred || exceptionCounter > 0)
    {
        throw new InvalidOperationException($"Error occured while saving Images from {url}");
    }

    return Json(imageUrls);
}
/// <summary>
/// Starts crawling the site identified by <c>siteId</c> on a dedicated thread.
/// Does nothing beyond logging when this crawler is already running.
/// </summary>
/// <remarks>
/// When the worker thread finishes, whether the crawl succeeded or failed,
/// <c>isRunning</c> is reset and <c>manager.Done(this)</c> is called.
/// </remarks>
/// <exception cref="AgetNotInitializedException">The crawler has not been initialized.</exception>
public void Start()
{
    if (!init)
    {
        throw new AgetNotInitializedException();
    }

    // NOTE(review): lock (this) is kept because other members of this class may
    // synchronise on the same monitor; consider a private lock object class-wide.
    lock (this)
    {
        log.Debug($"Starting crawler with id {guid}");
        if (isRunning)
        {
            log.Info($"Crawler with id {guid} is already started");
            return;
        }

        log.Debug("Initializing CrawlerX");
        isRunning = true;
        agent = new CrawlerX();
        agent.PageCrawlCompleted += Agent_PageCrawlCompleted;
        agent.PageLinksCrawlDisallowed += Agent_PageLinksCrawlDisallowed;
        // The custom crawl-decision hook is currently disabled:
        //agent.ShouldCrawlPage(ShouldCrawlPage);

        var worker = new Thread(() =>
        {
            try
            {
                log.Debug("Trying to start CrawlX");

                // Read the start URL and release the database context before the
                // (potentially long) crawl instead of holding it open throughout.
                Uri startUri = null;
                using (var dbContext = new ApplicationDbContext())
                {
                    var site = dbContext.Sites.FirstOrDefault(m => m.Id == siteId);
                    if (site != null)
                    {
                        startUri = new Uri(site.BaseUrl);
                    }
                }

                if (startUri == null)
                {
                    log.Error($"Crawler with id {guid} found no site with id {siteId}; nothing to crawl");
                }
                else
                {
                    agent.Crawl(startUri);
                    log.Info("Crawling is done");
                }
            }
            catch (Exception ex)
            {
                // An exception escaping a thread's entry point would terminate the
                // process, so log it here and let the finally block clean up.
                log.Error($"Crawler with id {guid} failed: {ex}");
            }
            finally
            {
                // Always clear the running flag and notify the manager; otherwise a
                // failed crawl would leave this crawler permanently marked as running.
                lock (this)
                {
                    isRunning = false;
                }

                log.Debug("Calling manager");
                manager.Done(this);
            }
        });
        worker.Start();
    }
}