/* ========== Private Members ======== */

/* ======= Class Constructors ======== */
// ? public DataScraper() {}
// ? public DataCrawler(CrawlConfigurationX configX) {}

#region Public Class Methods
/* ================================= Class Methods {Public} ============================ */

/// <summary>
/// Static method for crawling. Builds a crawler from the given configuration
/// (page limits, whether to render JS, etc.), wires up the page-completed
/// handler selected by <paramref name="pageHandlerType"/>, and runs the crawl.
/// </summary>
/// <param name="configX">Crawl configuration passed through to the crawler.</param>
/// <param name="httpHandler">HTTP handler used by the proxy page requester (e.g. proxy settings).</param>
/// <param name="pageHandlerType">Selects which PageCrawlCompleted handler is attached; other enum values attach none.</param>
/// <param name="uriToCrawl">Root URI to crawl; defaults to http://google.com.</param>
public static async Task Crawl(CrawlConfigurationX configX, HttpClientHandler httpHandler, PageHandlerType pageHandlerType, string uriToCrawl = "http://google.com")
{
    // Route requests through a ProxyPageRequester built on the supplied handler.
    ImplementationContainer impContainer = new ImplementationContainer();
    impContainer.PageRequester = new ProxyPageRequester(httpHandler, configX, new WebContentExtractor(), null);
    ImplementationOverride impOverride = new ImplementationOverride(configX, impContainer);

    // 'using' disposes the crawler (and its resources) at the end of the scope.
    using (var crawlerX = new CrawlerX(configX, impOverride))
    {
        // Decide per page whether JS should be rendered.
        crawlerX.ShouldRenderPageJavascript((crawledPage, crawlContext) =>
        {
            if (crawledPage.Uri.AbsoluteUri.Contains("ghost"))
            {
                return new CrawlDecision { Allow = false, Reason = "scared to render ghost javascript." };
            }

            return new CrawlDecision { Allow = true };
        });

        switch (pageHandlerType)
        {
            case PageHandlerType.wordFrequency:
                // Handler invoked each time the crawl for a page completes.
                crawlerX.PageCrawlCompleted += WordFrequencyHandler;
                break;
            case PageHandlerType.sentimentAnalysis:
                crawlerX.PageCrawlCompleted += SentimentAnalysisHandler;
                break;
        }

        await crawlerX.CrawlAsync(new Uri(uriToCrawl));
    }
}
/// <summary>
/// Sets <c>_website</c> to the Website entity whose Name equals the crawl URL,
/// or to a fresh in-memory instance when no matching row exists.
/// NOTE(review): the newly created Website is never added to
/// <c>_websitesContext</c> here — presumably a caller attaches/saves it; confirm.
/// </summary>
void SetOrCreateWebsite()
{
    _website = _websitesContext.Websites
        .Where(w => w.Name == _url.AbsoluteUri)
        .SingleOrDefault();

    if (_website == null)
    {
        _website = new Website { Name = _url.AbsoluteUri };
    }
}

/// <summary>
/// Resets the collected page list and crawls <c>_url</c>.
/// </summary>
/// <returns>True when the crawl finished without an error.</returns>
async Task<bool> CrawlAsync()
{
    _sitePages = new List<WebPage>();
    var result = await _crawler.CrawlAsync(_url);
    return !result.ErrorOccurred;
}