/* ============================== Private Members ============================== */

/* ============================ Class Constructors ============================= */
// ? public DataScraper() {}
// ? public DataCrawler(CrawlConfigurationX configX) {}

#region Public Class Methods
/* =========================== Class Methods {Public} ========================== */

/// <summary>
/// Static method for crawling. Pass in a configuration (i.e. specify how many
/// sites to crawl, whether or not to render js, etc.), then create and execute
/// the crawler.
/// </summary>
public static async Task Crawl(CrawlConfigurationX configX, HttpClientHandler httpHandler,
                               PageHandlerType pageHandlerType, string uriToCrawl = "http://google.com")
{
    // Route page requests through the supplied HttpClientHandler (e.g. a proxy).
    ImplementationContainer impContainer = new ImplementationContainer();
    impContainer.PageRequester = new ProxyPageRequester(httpHandler, configX, new WebContentExtractor(), null);
    ImplementationOverride impOverride = new ImplementationOverride(configX, impContainer);

    // 'using' scopes crawlerX so it is disposed at the closing brace, following
    // the pattern shown in the AbotX GitHub example.
    using (var crawlerX = new CrawlerX(configX, impOverride))
    {
        crawlerX.ShouldRenderPageJavascript((crawledPage, crawlContext) =>
        {
            if (crawledPage.Uri.AbsoluteUri.Contains("ghost"))
            {
                return new CrawlDecision { Allow = false, Reason = "Scared to render ghost javascript." };
            }

            return new CrawlDecision { Allow = true };
        });

        switch (pageHandlerType)
        {
            case PageHandlerType.wordFrequency:
                // Handler called when the crawl for a page is complete.
                crawlerX.PageCrawlCompleted += WordFrequencyHandler;
                break;
            case PageHandlerType.sentimentAnalysis:
                crawlerX.PageCrawlCompleted += SentimentAnalysisHandler;
                break;
        }

        await crawlerX.CrawlAsync(new Uri(uriToCrawl));
    }
}
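// A minimal usage sketch (not part of the original source): crawl a single uri
// through a local proxy with the word-frequency handler. The config values,
// proxy port (8118), and uri below are placeholder assumptions for illustration.
public static async Task CrawlExample()
{
    var config = new CrawlConfigurationX
    {
        MaxPagesToCrawl = 5, // keep the example crawl small
        MaxCrawlDepth = 1
    };
    var handler = new HttpClientHandler
    {
        Proxy = new WebProxy(new Uri("http://localhost:8118")) // assumed local proxy port
    };
    await Crawl(config, handler, PageHandlerType.wordFrequency, "http://example.com");
}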
public void Run()
{
    var resources = SqlHelper.UnloadResources();
    if (!resources.Any())
    {
        return;
    }

    // Queue 10 results pages per resource; page offsets step by 10.
    var siteToCrawls = new List<SiteToCrawl>();
    foreach (var res in resources)
    {
        for (var i = 0; i < 10; i++)
        {
            siteToCrawls.Add(new SiteToCrawl
            {
                Uri = new Uri(string.Format(_urlPattern, res, 10 * i)),
                SiteBag = new { Name = res, Number = i + 1 }
            });
        }
    }

    CrawlConfigurationX config = AbotXConfigurationSectionHandler.LoadFromXml().Convert();
    XmlConfigurator.Configure(); // Configure log4net for the crawler's logging.

    var siteToCrawlProvider = new SiteToCrawlProvider();
    siteToCrawlProvider.AddSitesToCrawl(siteToCrawls);

    // Create the crawl engine instance.
    var impls = new ParallelImplementationOverride(
        config,
        new ParallelImplementationContainer
        {
            SiteToCrawlProvider = siteToCrawlProvider
        }
    );
    _crawlerEngine = new ParallelCrawlerEngine(config, impls);

    // Register for site-level events.
    _crawlerEngine.AllCrawlsCompleted += (sender, eventArgs) =>
    {
        Console.WriteLine("Completed crawling all sites");
        _crawlerEngine.Stop(true);
        Run();
    };
    _crawlerEngine.SiteCrawlCompleted += (sender, eventArgs) =>
    {
        Console.WriteLine("Completed crawling site {0}", eventArgs.CrawledSite.SiteToCrawl.Uri);
    };
    _crawlerEngine.CrawlerInstanceCreated += (sender, eventArgs) =>
    {
        eventArgs.Crawler.CrawlBag = eventArgs.SiteToCrawl.SiteBag;

        // Register for crawler-level events. These are Abot's events.
        eventArgs.Crawler.PageCrawlCompleted += (abotSender, abotEventArgs) =>
        {
            var crawlX = abotSender as CrawlerX;
            CrawledPage crawledPage = abotEventArgs.CrawledPage;

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
            }
            else if (string.IsNullOrEmpty(crawledPage.Content.Text))
            {
                Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
            }
            else
            {
                try
                {
                    // Only persist pages reached via a result link (depth 1).
                    if (crawledPage.CrawlDepth == 1)
                    {
                        Console.WriteLine("Depth: {0} --- Crawl of page succeeded {1}",
                            crawledPage.CrawlDepth, crawledPage.Uri.AbsoluteUri);

                        var item = new CrawledItem()
                        {
                            Name = crawlX.CrawlBag.Name,
                            PageNumber = crawlX.CrawlBag.Number,
                            Url = crawledPage.Uri.AbsoluteUri,
                            Detail = crawledPage.Content.Text
                        };
                        SqlHelper.Store(new List<CrawledItem>() { item });
                    }
                }
                catch (Exception e)
                {
                    Console.WriteLine(e.Message);
                }
            }

            //var htmlAgilityPackDocument = crawledPage.HtmlDocument;          // Html Agility Pack parser
            //var angleSharpHtmlDocument = crawledPage.AngleSharpHtmlDocument; // AngleSharp parser
        };

        eventArgs.Crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
        {
            CrawlDecision decision = new CrawlDecision { Allow = true };

            // Skip depth-0 pages that are not Baidu search pages, and depth-1
            // pages that are not Baidu result links.
            var skipAtDepth0 = pageToCrawl.CrawlDepth == 0 && !pageToCrawl.Uri.AbsoluteUri.Contains("www.baidu.com/s?wd");
            var skipAtDepth1 = pageToCrawl.CrawlDepth == 1 && !pageToCrawl.Uri.AbsoluteUri.Contains("www.baidu.com/link");
            if (skipAtDepth0 || skipAtDepth1)
            {
                return new CrawlDecision { Allow = false, Reason = "Only crawl Baidu search pages and their result links" };
            }

            return decision;
        });
    };

    _crawlerEngine.StartAsync();
}
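// A minimal sketch (an assumption, not the original source) of the CrawledItem
// shape the PageCrawlCompleted handler above relies on, inferred from its object
// initializer; the real class and the SqlHelper.Store signature may differ.
public class CrawledItem
{
    public string Name { get; set; }    // resource name carried in the CrawlBag
    public int PageNumber { get; set; } // 1-based results-page number from the CrawlBag
    public string Url { get; set; }     // absolute uri of the crawled page
    public string Detail { get; set; }  // raw text content of the page
}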
// PRIVATE CLASS MEMBERS

// MAIN ===================================================================================

/// <param name="args"> Command line arguments passed to the executable. </param>
static async Task Main(string[] args)
{
    // Create a Logger object from Serilog that writes up to Debug-level prints.
    SetupLogger();
    Log.Logger.Information("Darkweb Data Scraper start...");

    // Parse command line arguments and store them in "parsedArgs".
    SetupParser(args);

    // Build a TorSharpSettings object for use with TorSharp (moved out of Main for clarity).
    var settings = SetupTorSharpSettings();

    // Asynchronously fetch and configure the TorSharp tools (Tor and Privoxy).
    await SetupTorSharpTools(settings);

    // * Start the Tor proxy --------------------------------------------------------------
    using (var proxy = new TorSharpProxy(settings))
    {
        // Block until the TorSharp proxy is configured and started.
        await proxy.ConfigureAndStartAsync();

        // * SETUP AND EXECUTE CRAWLER ================================================
        // Set up the crawler configuration.
        CrawlConfigurationX crawlConfig = new CrawlConfigurationX
        {
            MaxPagesToCrawl = 30,                             // Max total urls this crawler should crawl
            MaxCrawlDepth = 1,                                // Depth for crawler to traverse urls
            IsJavascriptRenderingEnabled = false,             // Should crawler render JS?
            JavascriptRenderingWaitTimeInMilliseconds = 2000, // How long to wait for js to process
            MaxConcurrentSiteCrawls = 1,                      // Only crawl a single site at a time
            MaxRetryCount = 3                                 // Retry connecting and crawling a site 'x' times
        };

        if (parsedArgs.InputFile == null) // "-s" or "--single": crawl one starting uri
        {
            var handler = new HttpClientHandler
            {
                Proxy = new WebProxy(new Uri("http://localhost:" + settings.PrivoxySettings.Port))
            };

            // Crawl
            await DataScraper.Crawl(crawlConfig, handler, parsedArgs.handlerType, parsedArgs.StartingUri);
            BuildBsonDocument(DataScraper.allParsedText, parsedArgs.StartingUri);

            // Reset values for the next crawl.
            DataScraper.allParsedText = new List<string>();
            DataScraper.siteTitle = "";
            DataScraper.dataDocuments = new List<BsonDocument>();

            if (MONGO_URI == "")
            {
                Log.Logger.Information("Database information is no longer accessible or available. " +
                    "You will need to provide your own Mongo details in \"Crawler.cs\".");
            }
        }
        else // "-m" or "--multi": crawl every site listed in the input file
        {
            string inputFilePath = parsedArgs.InputFile;
            var sitesToCrawl = GenerateSiteList(inputFilePath);

            for (int i = 0; i < sitesToCrawl.Count; i++)
            {
                var handler = new HttpClientHandler
                {
                    Proxy = new WebProxy(new Uri("http://localhost:" + settings.PrivoxySettings.Port))
                };

                // Crawl
                await DataScraper.Crawl(crawlConfig, handler, parsedArgs.handlerType, sitesToCrawl[i]);
                BuildBsonDocument(DataScraper.allParsedText, sitesToCrawl[i]);

                if (MONGO_URI == "")
                {
                    Log.Logger.Information("Database information is no longer accessible or available. " +
                        "You will need to provide your own Mongo details in \"Crawler.cs\".");
                }

                if (MONGO_URI != "" && MONGO_COLLECTION_NAME != "" && MONGO_DATABASE_NAME != "")
                {
                    var client = new MongoClient(MONGO_URI);
                    var database = client.GetDatabase(MONGO_DATABASE_NAME);
                    var collection = database.GetCollection<BsonDocument>(MONGO_COLLECTION_NAME);
                    collection.InsertMany(DataScraper.dataDocuments);
                }

                // Reset values for the next crawl.
                DataScraper.allParsedText = new List<string>();
                DataScraper.siteTitle = "";
                DataScraper.dataDocuments = new List<BsonDocument>();
            }
        }
        // * ==========================================================================

        // Stop the TorSharp tools so that the proxy is no longer listening on the configured port.
        proxy.Stop();
    }
    // * ----------------------------------------------------------------------------------
}