/// <summary>
/// Creates the crawler used by this controller and routes its
/// page-crawl-completed, page-crawl-disallowed and page-links-crawl-disallowed
/// async events to <c>SaveWebsiteUriRequestResult</c>.
/// </summary>
public HomeController()
 {
     // Wire up every event on a local first, then publish the fully
     // configured instance to the field.
     var crawler = new CrawlerX();

     crawler.PageCrawlCompletedAsync += SaveWebsiteUriRequestResult;
     crawler.PageCrawlDisallowedAsync += SaveWebsiteUriRequestResult;
     crawler.PageLinksCrawlDisallowedAsync += SaveWebsiteUriRequestResult;

     _crawler = crawler;
 }
Example #2
0
        /// <summary>
        /// Console entry point. Loads the crawl targets and crawl rules from the
        /// <c>FAQ</c> folder under the application base directory, builds a
        /// <c>CrawlerX</c> from the XML configuration (with JavaScript rendering
        /// switched on), crawls every target in turn and then waits for console input.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {
            string baseDirectory = System.AppDomain.CurrentDomain.BaseDirectory;

            // Inputs are read before XmlConfigurator.Configure() runs, same as before.
            var urisToCrawl = GetSiteToCrawl(Path.Combine(baseDirectory, @"FAQ\CrawlUrls.txt"));
            var ruleText    = GetCrawlRuleFileContent(Path.Combine(baseDirectory, @"FAQ\CrawlRules.txt"));
            var ruleBasedDecisionMaker = new CrawlDecisionMakerWithCrawlRules(ruleText);

            XmlConfigurator.Configure();

            // Start from the XML-defined settings, then override a few in code.
            var config = AbotXConfigurationSectionHandler.LoadFromXml().Convert();
            config.IsJavascriptRenderingEnabled = true;
            config.JavascriptRenderingWaitTimeInMilliseconds = 3000;
            config.MaxConcurrentSiteCrawls = 1;
            config.MaxConcurrentThreads = 2;

            // Replace the default crawl decision maker with the rule-file driven one.
            var overrides = new ImplementationOverride(config)
            {
                CrawlDecisionMaker = ruleBasedDecisionMaker
            };

            var crawler = new CrawlerX(config, overrides);
            crawler.PageCrawlStarting += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompleted += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowed += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowed += crawler_PageLinksCrawlDisallowed;

            // The per-site crawl result is not inspected; the event handlers do the work.
            foreach (var uriToCrawl in urisToCrawl)
            {
                crawler.Crawl(uriToCrawl);
            }

            // Block until a key is pressed.
            Console.Read();
        }
        /* ========== Private Members ======== */


        /* ======= Class Constructors ======== */
        // ? public DataScraper() {}
        // ? public DataCrawler(CrawlConfigurationX configX) {}

        #region Public Class Methods
        /* ================================= Class Methods {Public} ============================ */

        /// <summary>
        /// Builds a <c>CrawlerX</c> from the supplied configuration, attaches the
        /// page handler selected by <paramref name="pageHandlerType"/>, and crawls
        /// <paramref name="uriToCrawl"/>.
        /// </summary>
        /// <param name="configX">Crawler configuration (e.g. how many sites to crawl, whether to render JavaScript).</param>
        /// <param name="httpHandler">HTTP handler passed to the <c>ProxyPageRequester</c> that fetches pages.</param>
        /// <param name="pageHandlerType">Selects which handler is subscribed to <c>PageCrawlCompleted</c>; other values subscribe none.</param>
        /// <param name="uriToCrawl">Absolute URI of the site to crawl.</param>
        /// <returns>A task that completes when the crawl has finished.</returns>
        public static async Task Crawl(CrawlConfigurationX configX, HttpClientHandler httpHandler, PageHandlerType pageHandlerType, string uriToCrawl = "http://google.com")
        {
            // Route page requests through the caller-supplied HTTP handler.
            var impContainer = new ImplementationContainer
            {
                PageRequester = new ProxyPageRequester(httpHandler, configX, new WebContentExtractor(), null)
            };

            var impOverride = new ImplementationOverride(configX, impContainer);

            // 'using' disposes the crawler when the block exits, even if the crawl throws.
            using (var crawlerX = new CrawlerX(configX, impOverride))
            {
                crawlerX.ShouldRenderPageJavascript((crawledPage, crawlContext) =>
                {
                    // Skip JavaScript rendering for any page whose URI mentions "ghost".
                    if (crawledPage.Uri.AbsoluteUri.Contains("ghost"))
                    {
                        return new CrawlDecision
                        {
                            Allow = false,
                            Reason = "scared to render ghost javascript."
                        };
                    }

                    return new CrawlDecision { Allow = true };
                });

                // Subscribe the handler that runs once each page has been crawled.
                switch (pageHandlerType)
                {
                    case PageHandlerType.wordFrequency:
                        crawlerX.PageCrawlCompleted += WordFrequencyHandler;
                        break;

                    case PageHandlerType.sentimentAnalysis:
                        crawlerX.PageCrawlCompleted += SentimentAnalysisHandler;
                        break;
                }

                await crawlerX.CrawlAsync(new Uri(uriToCrawl));
            }
        }
Example #4
0
        /// <summary>
        /// Crawls the given site and returns the collected <c>imageUrls</c> as JSON.
        /// </summary>
        /// <param name="url">
        /// Site to crawl. When it does not start with <c>http://</c> or
        /// <c>https://</c>, <c>http://</c> is prepended.
        /// </param>
        /// <returns>
        /// A JSON result containing <c>imageUrls</c>, or a bad-request result when
        /// <paramref name="url"/> is null, empty or whitespace.
        /// </returns>
        /// <exception cref="Exception">
        /// The crawl reported an error or <c>exceptionCounter</c> is greater than zero.
        /// </exception>
        public IHttpActionResult Get(string url)
        {
            // A missing url is a client error, not a server fault.
            if (string.IsNullOrWhiteSpace(url))
            {
                return BadRequest("A url to crawl is required.");
            }

            // Test for a real scheme prefix with an ordinal comparison: a bare
            // StartsWith("http") is culture-sensitive and also matches hosts such
            // as "httpbin.org", which then fail to parse as an absolute URI.
            bool hasHttpScheme =
                url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);

            if (!hasHttpScheme)
            {
                url = $"http://{url}";
            }

            baseUrl = url;

            var crawler = new CrawlerX();

            var uri = new Uri(url);

            crawler.PageCrawlCompleted += crawler_ProcessPageCrawlCompleted;

            var result = crawler.Crawl(uri);

            // exceptionCounter is not modified here; presumably the page-completed
            // handler increments it — confirm against that handler.
            if (result.ErrorOccurred || exceptionCounter > 0)
            {
                throw new Exception($"Error occured while saving Images from {url}");
            }

            return Json(imageUrls);
        }
        /// <summary>
        /// Starts crawling the site identified by <c>siteId</c> on a new thread.
        /// Does nothing beyond logging when this crawler is already running.
        /// When the worker finishes, whether it succeeded or failed, the running
        /// flag is cleared and <c>manager.Done(this)</c> is called.
        /// </summary>
        /// <exception cref="AgetNotInitializedException">
        /// <c>init</c> is false when the method is called.
        /// </exception>
        public void Start()
        {
            if (!init)
            {
                throw new AgetNotInitializedException();
            }

            // NOTE(review): locking on 'this' lets outside code contend for the same
            // monitor; a private lock object would be safer. Left as-is because other
            // members outside this view may synchronise on 'this' as well.
            lock (this)
            {
                log.Debug($"Starting crawler with id {guid}");
                if (isRunning)
                {
                    log.Info($"Crawler with id {guid} is already started");
                    return;
                }

                log.Debug($"Initializing CrawlerX");
                isRunning = true;
                agent     = new CrawlerX();
                agent.PageCrawlCompleted       += Agent_PageCrawlCompleted;
                agent.PageLinksCrawlDisallowed += Agent_PageLinksCrawlDisallowed;

                //agent.ShouldCrawlPage(ShouldCrawlPage);

                (new Thread(() =>
                {
                    log.Debug("Trying to start CrawlX");
                    try
                    {
                        using (var dbContext = new ApplicationDbContext())
                        {
                            // FirstOrDefault yields null when no site has this id.
                            var site = dbContext.Sites.FirstOrDefault(m => m.Id == siteId);
                            if (site == null)
                            {
                                log.Error($"Crawler with id {guid} found no site with id {siteId}; nothing to crawl");
                            }
                            else
                            {
                                agent.Crawl(new Uri(site.BaseUrl));
                                log.Info("Crawling is done");
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        // Top of the worker thread: an exception escaping here would be
                        // unhandled, so log it instead of letting it propagate.
                        log.Error($"Crawler with id {guid} failed: {ex}");
                    }
                    finally
                    {
                        // Always clear the flag and notify the manager, otherwise a
                        // failed run would leave this crawler marked as running.
                        lock (this)
                        {
                            isRunning = false;
                        }

                        log.Debug("Calling manager");
                        manager.Done(this);
                    }
                })).Start();
            }
        }