Example #1
        static void Main(string[] args)
        {
            var urisToCrawl      = GetSiteToCrawl(Path.Combine(System.AppDomain.CurrentDomain.BaseDirectory, @"FAQ\CrawlUrls.txt"));
            var crawlRuleContent = GetCrawlRuleFileContent(Path.Combine(System.AppDomain.CurrentDomain.BaseDirectory, @"FAQ\CrawlRules.txt"));
            var decisionMaker    = new CrawlDecisionMakerWithCrawlRules(crawlRuleContent);

            XmlConfigurator.Configure();

            var config = AbotXConfigurationSectionHandler.LoadFromXml().Convert();

            config.IsJavascriptRenderingEnabled = true;
            config.JavascriptRenderingWaitTimeInMilliseconds = 3000;
            config.MaxConcurrentSiteCrawls = 1;
            config.MaxConcurrentThreads    = 2;

            var impls = new ImplementationOverride(config);

            impls.CrawlDecisionMaker = decisionMaker;
            var crawler = new CrawlerX(config, impls);

            crawler.PageCrawlStarting        += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompleted       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowed      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowed += crawler_PageLinksCrawlDisallowed;

            foreach (var uriToCrawl in urisToCrawl)
            {
                var result = crawler.Crawl(uriToCrawl);
            }

            Console.Read();
        }
        /* ========== Private Members ======== */
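
        // NOTE: hypothetical sketches of the private helpers referenced in Main() above.
        // The real implementations are not shown on this page; names and signatures are
        // inferred from the call sites (requires System.IO and System.Linq). Only one of the
        // four event handlers Main() subscribes is sketched here.

        // Reads one seed URI per line from the given text file (sketch).
        private static IEnumerable<Uri> GetSiteToCrawl(string filePath)
        {
            return File.ReadAllLines(filePath)
                   .Where(line => !string.IsNullOrWhiteSpace(line))
                   .Select(line => new Uri(line.Trim()));
        }

        // Returns the raw crawl-rule file content consumed by CrawlDecisionMakerWithCrawlRules (sketch).
        private static string GetCrawlRuleFileContent(string filePath)
        {
            return File.ReadAllText(filePath);
        }

        // Minimal PageCrawlCompleted handler matching the subscription in Main() (sketch).
        private static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            Console.WriteLine("Crawl completed: " + e.CrawledPage.Uri.AbsoluteUri);
        }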


        /* ======= Class Constructors ======== */
        // ? public DataScraper() {}
        // ? public DataCrawler(CrawlConfigurationX configX) {}

        #region Public Class Methods
        /* ================================= Class Methods {Public} ============================ */

        /// <summary>
        /// Static method for crawling. Pass in a configuration
        /// (e.g. how many concurrent site crawls, whether or not to
        /// render JavaScript, etc.); the method then creates and executes the crawler.
        /// </summary>
        public static async Task Crawl(CrawlConfigurationX configX, HttpClientHandler httpHandler, PageHandlerType pageHandlerType, string uriToCrawl = "http://google.com")
        {
            // 'using' scopes the crawlerX object and disposes of it at the end
            // of the block (i.e. at the closing curly brace).
            // I saw this used in the GitHub example. Maybe it's good practice?

            // Swap in a custom page requester (ProxyPageRequester) that routes requests
            // through the supplied HttpClientHandler.
            ImplementationContainer impContainer = new ImplementationContainer();

            impContainer.PageRequester = new ProxyPageRequester(httpHandler, configX, new WebContentExtractor(), null);

            ImplementationOverride impOverride = new ImplementationOverride(configX, impContainer);

            using (var crawlerX = new CrawlerX(configX, impOverride))
            {
                crawlerX.ShouldRenderPageJavascript((crawledPage, crawlContext) =>
                {
                    // Skip JavaScript rendering for any page whose URL contains "ghost".
                    if (crawledPage.Uri.AbsoluteUri.Contains("ghost"))
                    {
                        return new CrawlDecision {
                            Allow = false, Reason = "scared to render ghost javascript."
                        };
                    }

                    return new CrawlDecision { Allow = true };
                });

                switch (pageHandlerType)
                {
                case PageHandlerType.wordFrequency:
                    //add handler to be called when the crawl for that page is complete
                    crawlerX.PageCrawlCompleted += WordFrequencyHandler;
                    break;

                case PageHandlerType.sentimentAnalysis:
                    crawlerX.PageCrawlCompleted += SentimentAnalysisHandler;
                    break;
                }

                await crawlerX.CrawlAsync(new Uri(uriToCrawl));
            }
        }
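
        // NOTE: hypothetical sketches of PageHandlerType and the two PageCrawlCompleted
        // handlers referenced in the switch above; the real implementations are not shown
        // on this page (the word-frequency example requires System.Linq).
        // Example call: await Crawl(configX, new HttpClientHandler(), PageHandlerType.wordFrequency);
        public enum PageHandlerType
        {
            wordFrequency,
            sentimentAnalysis
        }

        // Counts word frequencies in the crawled page's text (sketch).
        private static void WordFrequencyHandler(object sender, PageCrawlCompletedArgs e)
        {
            var text = e.CrawledPage.Content.Text ?? string.Empty;
            var frequencies = text
                .Split(new[] { ' ', '\t', '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries)
                .GroupBy(word => word.ToLowerInvariant())
                .ToDictionary(group => group.Key, group => group.Count());

            Console.WriteLine("{0}: {1} distinct words", e.CrawledPage.Uri.AbsoluteUri, frequencies.Count);
        }

        // Placeholder for the sentiment-analysis handler; a real implementation would
        // hand the page text to an analysis service (sketch).
        private static void SentimentAnalysisHandler(object sender, PageCrawlCompletedArgs e)
        {
            Console.WriteLine("Queueing for sentiment analysis: " + e.CrawledPage.Uri.AbsoluteUri);
        }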