/// <summary>
/// Main worker loop. Each iteration reads at most one command message (which may switch
/// the worker state machine between loading, idle and crawling) and, while the worker is
/// not idle, reads at most one URL message and hands it to the state machine.
/// </summary>
/// <param name="cancellationToken">
/// Checked at the top of every iteration; the loop ends once cancellation is requested.
/// </param>
/// <returns>A task that completes when the loop has exited.</returns>
private async Task RunAsync(CancellationToken cancellationToken)
{
    CloudStorageAccount storageAccount = CloudStorageAccount.Parse(
        ConfigurationManager.AppSettings["StorageConnectionString"]);
    CloudQueueClient queueClient = storageAccount.CreateCloudQueueClient();

    CloudQueue commandQueue = queueClient.GetQueueReference(CommandMessage.QUEUE_COMMAND);
    commandQueue.CreateIfNotExists();

    CloudQueue urlQueue = queueClient.GetQueueReference(UrlMessage.QUEUE_URL);
    // Fix: the original called commandQueue.CreateIfNotExists() a second time here,
    // so the URL queue was never created before being read from below.
    urlQueue.CreateIfNotExists();

    while (!cancellationToken.IsCancellationRequested)
    {
        // The TimeSpan argument is the visibility timeout in the Azure Storage SDK.
        CloudQueueMessage commandMessage = commandQueue.GetMessage(TimeSpan.FromMinutes(5));
        if (commandMessage != null)
        {
            string command = commandMessage.AsString;
            if (command == CommandMessage.COMMAND_LOAD)
            {
                workerStateMachine.setState(WorkerStateMachine.STATE_LOADING);
                webLoader = new WebLoader();
            }
            else if (command == CommandMessage.COMMAND_IDLE)
            {
                workerStateMachine.setState(WorkerStateMachine.STATE_IDLE);
            }
            else if (command == CommandMessage.COMMAND_CRAWL)
            {
                workerStateMachine.setState(WorkerStateMachine.STATE_CRAWLING);
                webCrawler = new WebCrawler(statsManager);
            }

            // The command is deleted whether or not it was recognised.
            commandQueue.DeleteMessage(commandMessage);
        }

        // Anything other than idle means the worker is loading or crawling.
        if (workerStateMachine.getState() != WorkerStateMachine.STATE_IDLE)
        {
            CloudQueueMessage urlMessage = urlQueue.GetMessage();
            if (urlMessage != null)
            {
                // Load or crawl the URL depending on the current state; the state
                // machine's return value decides whether the message is removed.
                UrlMessage urlEntity = UrlMessage.Parse(urlMessage.AsString);
                bool deleteMessage = workerStateMachine.Act(urlEntity);
                if (deleteMessage)
                {
                    urlQueue.DeleteMessage(urlMessage);
                }
            }
            else
            {
                // Original author's note: Act(null) must still be called on an empty
                // queue so that crawling can eventually finish.
                workerStateMachine.Act(null);
            }
        }

        // Deliberately not passing cancellationToken: doing so would make this method
        // throw on cancellation instead of exiting through the loop condition.
        await Task.Delay(100);
    }
}
/// <summary>
/// Worker entry point. Builds a crawler from the two sites' robots.txt rules, then
/// processes load-queue messages and, once loading or crawling has begun, crawl-queue
/// messages, until a message appears on the stop queue. This method never returns.
/// </summary>
public override void Run()
{
    CloudQueue loadQueue = CloudConfiguration.GetLoadingQueue();
    CloudQueue crawlQueue = CloudConfiguration.GetCrawlingQueue();
    CloudQueue stopQueue = CloudConfiguration.GetStopQueue();
    // The table reference is not used below; the call is kept in case GetTable()
    // has side effects — TODO confirm and remove if it does not.
    CloudTable table = CloudConfiguration.GetTable();

    List<string> cnnRules = ProcessRobots("http://www.cnn.com/robots.txt");
    List<string> bleacherReportRules = ProcessRobots("http://www.bleacherreport.com/robots.txt");
    WebCrawler crawler = new WebCrawler(cnnRules, bleacherReportRules);

    State = "Idle";
    Thread.Sleep(10000);
    CloudQueueMessage stopMessage = stopQueue.GetMessage();

    CPUCount = new PerformanceCounter("Processor", "% Processor Time", "_Total");
    MemCount = new PerformanceCounter("Memory", "Available MBytes");

    while (true)
    {
        while (stopMessage == null)
        {
            // Load messages take priority over crawl messages.
            CloudQueueMessage loadMessage = loadQueue.GetMessage();
            if (loadMessage != null)
            {
                State = "Loading";
                string message = loadMessage.AsString;
                crawler.ProcessURL(message);
                loadQueue.DeleteMessage(loadMessage);
            }
            else if (State.Equals("Loading") || State.Equals("Crawling"))
            {
                // Crawling only starts after at least one load message has been seen.
                CloudQueueMessage crawlMessage = crawlQueue.GetMessage();
                if (crawlMessage != null)
                {
                    State = "Crawling";
                    crawler.ProcessURL(crawlMessage.AsString);
                    crawlQueue.DeleteMessage(crawlMessage);
                }
            }

            stopMessage = stopQueue.GetMessage();
        }

        State = "Idle";

        // Fix: stopMessage is never reset, so after a stop this outer loop used to spin
        // at full CPU. Sleeping keeps the worker parked without burning a core.
        // NOTE(review): the stop queue is never re-polled and the stop message is never
        // deleted, so the worker stays idle until the role restarts — confirm intended.
        Thread.Sleep(1000);
    }
}
/// <summary>
/// Role start-up hook. Raises the connection limit, runs the base start-up, derives a
/// numeric instance index from the role instance ID, and creates the crawler and the
/// worker state machine.
/// </summary>
/// <returns>The value returned by <c>base.OnStart()</c>.</returns>
public override bool OnStart()
{
    // Set the maximum number of concurrent connections
    ServicePointManager.DefaultConnectionLimit = 12;

    // For information on handling configuration changes
    // see the MSDN topic at https://go.microsoft.com/fwlink/?LinkId=166357.

    bool result = base.OnStart();

    Trace.TraceInformation("WorkerRole1 has been started");

    // Derive the instance index from the trailing number of the instance ID: first try
    // the text after the last '.', then fall back to the text after the last '_'.
    // (The original comments label these the cloud and compute-emulator ID formats.)
    // Fix: the original condition lacked the '!', so the fallback ran only when the
    // first parse had already succeeded and was skipped when it failed.
    // If both parses fail, instanceID is 0 (int.TryParse zeroes its out value on failure).
    int instanceID;
    string instanceId = RoleEnvironment.CurrentRoleInstance.Id;
    if (!int.TryParse(instanceId.Substring(instanceId.LastIndexOf('.') + 1), out instanceID))
    {
        int.TryParse(instanceId.Substring(instanceId.LastIndexOf('_') + 1), out instanceID);
    }

    webCrawler = new WebCrawler(statsManager);
    workerStateMachine = new WorkerStateMachine(instanceID.ToString());

    return result;
}
/// <summary>
/// Worker entry point. Initialises storage, queues, tables, performance counters and
/// the crawler, then loops forever: while no stop message is present it processes load
/// messages and, once loading or crawling has begun, crawl messages, refreshing the
/// dashboard after every iteration. This method never returns.
/// </summary>
public override void Run()
{
    Storage = new AzureStorage();
    LoadQueue = CloudConfiguration.GetLoadingQueue();
    CrawlQueue = CloudConfiguration.GetCrawlingQueue();
    StopQueue = CloudConfiguration.GetStopQueue();
    SiteDataTable = CloudConfiguration.GetSiteDataTable();
    AdminStatusTable = CloudConfiguration.GetAdminStatusTable();
    StateQueue = CloudConfiguration.GetStateQueue();

    State = "Idle";
    CPUCount = new PerformanceCounter("Processor", "% Processor Time", "_Total");
    MemCount = new PerformanceCounter("Memory", "Available MBytes");
    Status = new AdminStatus(State, (int)CPUCount.NextValue(), (int)MemCount.NextValue());

    string[] robots = { "http://www.cnn.com/robots.txt", "http://www.bleacherreport.com/robots.txt" };
    Crawler = new WebCrawler(robots, Storage);

    Thread.Sleep(10000);
    string url = "";

    while (true)
    {
        CloudQueueMessage stopMessage = StopQueue.GetMessage();
        while (stopMessage == null)
        {
            // Load messages take priority over crawl messages.
            // Fix: the original set State = "Loading" unconditionally here, which made
            // the Idle gate below always true and undid a stop request on the very
            // next iteration. State now changes only when a message is actually taken.
            CloudQueueMessage loadMessage = LoadQueue.GetMessage();
            if (loadMessage != null)
            {
                State = "Loading";
                url = loadMessage.AsString;
                if (url.Contains("robots.txt"))
                {
                    // A robots message may carry several whitespace-separated links.
                    // Empty entries (from repeated whitespace) are skipped so that no
                    // empty URL is handed to the crawler.
                    string[] robotLinks = url.Split((char[])null, System.StringSplitOptions.RemoveEmptyEntries);
                    foreach (string link in robotLinks)
                    {
                        Crawler.ProcessURL(link);
                    }
                }
                else
                {
                    Crawler.ProcessURL(url);
                }

                // Fix: the original deleted the message only in the robots.txt branch,
                // so any other load message reappeared after its visibility timeout
                // and was processed again indefinitely.
                LoadQueue.DeleteMessage(loadMessage);
            }
            else if (State.Equals("Loading") || State.Equals("Crawling"))
            {
                // Crawling only proceeds once loading or crawling has begun.
                CloudQueueMessage crawlMessage = CrawlQueue.GetMessage();
                if (crawlMessage != null)
                {
                    State = "Crawling";
                    url = crawlMessage.AsString;
                    Crawler.ProcessURL(url);
                    CrawlQueue.DeleteMessage(crawlMessage);
                }
            }

            stopMessage = StopQueue.GetMessage();
            UpdateDashboard(url);
        }

        // A stop message was received: go idle, then poll the stop queue again.
        State = "Idle";
    }
}