/// <summary>
/// Builds a crawler: acquires the loading queue, crawling queue and table from
/// <c>CloudConfiguration</c>, stores the two caller-supplied rule lists, and
/// initializes the in-memory crawl state.
/// </summary>
/// <param name="CNNRules">Rule list kept for CNN.</param>
/// <param name="BleacherReportRules">Rule list kept for Bleacher Report.</param>
public WebCrawler(List<string> CNNRules, List<string> BleacherReportRules)
{
    // Cloud handles (same acquisition order as before: load queue, crawl queue, table).
    LoadQueue = CloudConfiguration.GetLoadingQueue();
    CrawlQueue = CloudConfiguration.GetCrawlingQueue();
    Table = CloudConfiguration.GetTable();

    // Per-site rule lists handed in by the caller; stored by reference, not copied.
    this.CNNRules = CNNRules;
    this.BleacherReportRules = BleacherReportRules;

    // Starts with no visited links.
    VisitedLinks = new HashSet<string>();

    // Cut-off date of 1 December 2016.
    // NOTE(review): presumably content older than this is skipped -- confirm against the code that reads OldestAllowed.
    DateTime cutoff = new DateTime(2016, 12, 1);
    OldestAllowed = cutoff;

    // Single blocked extension for now.
    List<string> blockedExtensions = new List<string>();
    blockedExtensions.Add(".jpg");
    BadExtensions = blockedExtensions;
}
/// <summary>
/// Worker loop: builds a <c>WebCrawler</c> from the CNN and Bleacher Report robots.txt
/// rules, then repeatedly pulls URLs from the loading queue (and, once loading has
/// started, from the crawling queue) and hands them to the crawler until a message
/// appears on the stop queue.
/// </summary>
/// <remarks>
/// <c>State</c> moves "Idle" -> "Loading" when a load message is processed,
/// -> "Crawling" when a crawl message is processed, and back to "Idle" once a stop
/// message has been seen. The method never returns.
/// NOTE(review): the stop message is never deleted and <c>stopMessage</c> is never
/// cleared, so after the first stop this worker stays idle for the rest of the
/// process lifetime. That behavior is unchanged here -- confirm it is intended.
/// </remarks>
public override void Run()
{
    // Pause between polls when there is nothing to do, so empty queues are not hammered.
    const int idlePollDelayMs = 1000;

    CloudQueue loadQueue = CloudConfiguration.GetLoadingQueue();
    CloudQueue crawlQueue = CloudConfiguration.GetCrawlingQueue();
    CloudQueue stopQueue = CloudConfiguration.GetStopQueue();

    // The returned table is not used in this method; the call is kept in case
    // GetTable() has initialization side effects (not visible from here).
    CloudConfiguration.GetTable();

    List<string> cnnRules = ProcessRobots("http://www.cnn.com/robots.txt");
    List<string> bleacherReportRules = ProcessRobots("http://www.bleacherreport.com/robots.txt");
    WebCrawler crawler = new WebCrawler(cnnRules, bleacherReportRules);

    State = "Idle";
    Thread.Sleep(10000);

    CloudQueueMessage stopMessage = stopQueue.GetMessage();
    CPUCount = new PerformanceCounter("Processor", "% Processor Time", "_Total");
    MemCount = new PerformanceCounter("Memory", "Available MBytes");

    while (true)
    {
        while (stopMessage == null)
        {
            bool processedMessage = false;

            // Load messages take priority over crawl messages.
            CloudQueueMessage loadMessage = loadQueue.GetMessage();
            if (loadMessage != null)
            {
                State = "Loading";
                crawler.ProcessURL(loadMessage.AsString);
                loadQueue.DeleteMessage(loadMessage);
                processedMessage = true;
            }
            else if (State.Equals("Loading") || State.Equals("Crawling"))
            {
                // Crawl messages are only consumed after loading has started.
                CloudQueueMessage crawlMessage = crawlQueue.GetMessage();
                if (crawlMessage != null)
                {
                    State = "Crawling";
                    crawler.ProcessURL(crawlMessage.AsString);
                    crawlQueue.DeleteMessage(crawlMessage);
                    processedMessage = true;
                }
            }

            if (!processedMessage)
            {
                // Nothing was available this round: back off instead of re-polling immediately.
                Thread.Sleep(idlePollDelayMs);
            }

            stopMessage = stopQueue.GetMessage();
        }

        State = "Idle";

        // Stopped: previously this outer loop spun at full speed doing nothing but
        // re-assigning State. Sleep so the idle worker does not burn a CPU core.
        Thread.Sleep(idlePollDelayMs);
    }
}
/// <summary>
/// Worker loop: initializes storage handles, performance counters and the
/// <c>WebCrawler</c>, then repeatedly pulls URLs from the loading queue (and, once
/// loading has started, from the crawling queue), hands them to the crawler, and
/// refreshes the dashboard after every polling round.
/// </summary>
/// <remarks>
/// <c>State</c> moves "Idle" -> "Loading" when a load message is processed,
/// -> "Crawling" when a crawl message is processed, and back to "Idle" when a stop
/// message is received. The method never returns.
/// NOTE(review): the stop message is fetched but never deleted, so it becomes
/// visible again after the queue's visibility timeout -- confirm that whoever
/// enqueues it also clears the stop queue before restarting a crawl.
/// </remarks>
public override void Run()
{
    Storage = new AzureStorage();
    LoadQueue = CloudConfiguration.GetLoadingQueue();
    CrawlQueue = CloudConfiguration.GetCrawlingQueue();
    StopQueue = CloudConfiguration.GetStopQueue();
    SiteDataTable = CloudConfiguration.GetSiteDataTable();
    AdminStatusTable = CloudConfiguration.GetAdminStatusTable();
    StateQueue = CloudConfiguration.GetStateQueue();

    State = "Idle";
    CPUCount = new PerformanceCounter("Processor", "% Processor Time", "_Total");
    MemCount = new PerformanceCounter("Memory", "Available MBytes");
    Status = new AdminStatus(State, (int)CPUCount.NextValue(), (int)MemCount.NextValue());

    string[] robots = { "http://www.cnn.com/robots.txt", "http://www.bleacherreport.com/robots.txt" };
    Crawler = new WebCrawler(robots, Storage);
    Thread.Sleep(10000);

    // Last URL handed to the crawler; passed to UpdateDashboard after every round.
    string url = "";

    while (true)
    {
        CloudQueueMessage stopMessage = StopQueue.GetMessage();
        while (stopMessage == null)
        {
            // Load messages take priority over crawl messages.
            // State is only switched to "Loading" when a load message actually exists;
            // setting it unconditionally made the Loading/Crawling gate below always
            // true and hid the "Idle" and "Crawling" states from the dashboard.
            CloudQueueMessage loadMessage = LoadQueue.GetMessage();
            if (loadMessage != null)
            {
                State = "Loading";
                url = loadMessage.AsString;
                if (url.Contains("robots.txt"))
                {
                    // A robots message may carry several whitespace-separated links
                    // (Split(null) splits on whitespace).
                    string[] robotLinks = url.Split(null);
                    foreach (string link in robotLinks)
                    {
                        // Consecutive whitespace yields empty entries; skip them.
                        if (link.Length == 0)
                        {
                            continue;
                        }

                        Crawler.ProcessURL(link);
                    }
                }
                else
                {
                    Crawler.ProcessURL(url);
                }

                // Delete on both paths: previously only robots messages were deleted,
                // so every other load message was re-delivered and re-processed.
                LoadQueue.DeleteMessage(loadMessage);
            }
            else if (State.Equals("Loading") || State.Equals("Crawling"))
            {
                // Crawl messages are only consumed after loading has started.
                CloudQueueMessage crawlMessage = CrawlQueue.GetMessage();
                if (crawlMessage != null)
                {
                    State = "Crawling";
                    url = crawlMessage.AsString;
                    Crawler.ProcessURL(url);
                    CrawlQueue.DeleteMessage(crawlMessage);
                }
            }

            stopMessage = StopQueue.GetMessage();
            UpdateDashboard(url);
        }

        State = "Idle";
    }
}