示例#1
0
        /// <summary>
        /// Console entry point. Reads the crawl targets and the crawl-rule text from the
        /// <c>FAQ</c> folder under the application base directory, builds a <c>CrawlerX</c>
        /// with JavaScript rendering enabled, and crawls every target one after another.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Both input files are resolved relative to the application base directory.
            var baseDirectory = System.AppDomain.CurrentDomain.BaseDirectory;
            var urlListPath   = Path.Combine(baseDirectory, @"FAQ\CrawlUrls.txt");
            var ruleFilePath  = Path.Combine(baseDirectory, @"FAQ\CrawlRules.txt");

            var targets     = GetSiteToCrawl(urlListPath);
            var ruleText    = GetCrawlRuleFileContent(ruleFilePath);
            var ruleDecider = new CrawlDecisionMakerWithCrawlRules(ruleText);

            // NOTE(review): presumably log4net's XmlConfigurator; kept after the rule loading
            // exactly as in the original ordering.
            XmlConfigurator.Configure();

            // Start from the XML configuration section, then override a few values in code.
            var settings = AbotXConfigurationSectionHandler.LoadFromXml().Convert();
            settings.IsJavascriptRenderingEnabled              = true;
            settings.JavascriptRenderingWaitTimeInMilliseconds = 3000;
            settings.MaxConcurrentSiteCrawls                   = 1;
            settings.MaxConcurrentThreads                      = 2;

            // Plug the rule-based decision maker into the crawler's implementations.
            var overrides = new ImplementationOverride(settings)
            {
                CrawlDecisionMaker = ruleDecider
            };
            var siteCrawler = new CrawlerX(settings, overrides);

            siteCrawler.PageCrawlStarting        += crawler_ProcessPageCrawlStarting;
            siteCrawler.PageCrawlCompleted       += crawler_ProcessPageCrawlCompleted;
            siteCrawler.PageCrawlDisallowed      += crawler_PageCrawlDisallowed;
            siteCrawler.PageLinksCrawlDisallowed += crawler_PageLinksCrawlDisallowed;

            // Targets are crawled sequentially; the crawl result is not inspected here.
            foreach (var target in targets)
            {
                siteCrawler.Crawl(target);
            }

            // Wait for console input before the process exits.
            Console.Read();
        }
示例#2
0
        /// <summary>
        /// Crawls the site at <paramref name="url"/> and returns <c>imageUrls</c> through <c>Json</c>.
        /// </summary>
        /// <param name="url">
        /// Address to crawl. When it does not already begin with <c>http://</c> or
        /// <c>https://</c> (compared case-insensitively), <c>http://</c> is prepended.
        /// </param>
        /// <returns>The result of <c>Json(imageUrls)</c>.</returns>
        /// <exception cref="ArgumentException"><paramref name="url"/> is null, empty, or whitespace.</exception>
        /// <exception cref="UriFormatException">The normalized address is not a valid URI.</exception>
        /// <exception cref="InvalidOperationException">
        /// The crawl reported an error, or <c>exceptionCounter</c> is greater than zero afterwards.
        /// </exception>
        public IHttpActionResult Get(string url)
        {
            if (string.IsNullOrWhiteSpace(url))
            {
                throw new ArgumentException("A URL to crawl is required.", nameof(url));
            }

            // Match the full scheme prefix with an ordinal, case-insensitive comparison:
            // a bare "http" prefix check treats hosts such as "httpbin.org" as already
            // having a scheme, and a case-sensitive check double-prefixes "HTTP://...".
            var hasHttpScheme = url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                             || url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);

            baseUrl = url = hasHttpScheme ? url : $"http://{url}";

            var uri = new Uri(url);

            var crawler = new CrawlerX();
            crawler.PageCrawlCompleted += crawler_ProcessPageCrawlCompleted;

            var result = crawler.Crawl(uri);

            if (result.ErrorOccurred || exceptionCounter > 0)
            {
                // InvalidOperationException derives from Exception, so existing
                // catch (Exception) handlers keep working; message text is unchanged.
                throw new InvalidOperationException($"Error occured while saving Images from {url}");
            }

            return Json(imageUrls);
        }
        /// <summary>
        /// Starts crawling this instance's site on a new thread. If a crawl is already
        /// running, the call only logs that fact and returns.
        /// </summary>
        /// <remarks>
        /// The worker thread looks up the site by <c>siteId</c>, crawls its <c>BaseUrl</c>,
        /// then clears <c>isRunning</c> and calls <c>manager.Done(this)</c>. Failures on the
        /// worker thread are logged rather than left unhandled.
        /// </remarks>
        /// <exception cref="AgetNotInitializedException">Thrown when <c>init</c> is false.</exception>
        public void Start()
        {
            if (!init)
            {
                throw new AgetNotInitializedException();
            }

            // NOTE(review): lock (this) is kept because other members of this class may
            // synchronize on the same monitor; confirm before moving to a private lock object.
            lock (this)
            {
                log.Debug($"Starting crawler with id {guid}");
                if (isRunning)
                {
                    log.Info($"Crawler with id {guid} is already started");
                    return;
                }

                log.Debug("Initializing CrawlerX");

                // Build and wire the crawler before flipping isRunning, so a failure here
                // does not leave the instance stuck in the "running" state.
                agent = new CrawlerX();
                agent.PageCrawlCompleted       += Agent_PageCrawlCompleted;
                agent.PageLinksCrawlDisallowed += Agent_PageLinksCrawlDisallowed;
                isRunning = true;

                //agent.ShouldCrawlPage(ShouldCrawlPage);

                new Thread(() =>
                {
                    log.Debug("Trying to start CrawlX");
                    try
                    {
                        // Only the base URL is needed, so the context is released before
                        // the crawl starts instead of being held for its whole duration.
                        string startUrl;
                        using (var dbContext = new ApplicationDbContext())
                        {
                            var site = dbContext.Sites.FirstOrDefault(m => m.Id == siteId);
                            startUrl = site?.BaseUrl;
                        }

                        if (string.IsNullOrWhiteSpace(startUrl))
                        {
                            log.Error($"Crawler with id {guid} has no site or base url for site id {siteId}");
                        }
                        else
                        {
                            agent.Crawl(new Uri(startUrl));
                            log.Info("Crawling is done");
                        }
                    }
                    catch (Exception ex)
                    {
                        // An exception escaping a thread's entry point terminates the
                        // process, so it is logged here instead.
                        log.Error($"Crawler with id {guid} failed: {ex}");
                    }
                    finally
                    {
                        lock (this)
                        {
                            isRunning = false;
                        }

                        // NOTE(review): the manager is now notified after failed runs too,
                        // so it is not left waiting on a crawler that will never finish;
                        // confirm Done is appropriate for that case.
                        log.Debug("Calling manager");
                        manager.Done(this);
                    }
                }).Start();
            }
        }