/// <summary>
/// Worker-role entry point: wires the crawler to its Azure queue/table endpoints,
/// then loops forever dispatching commands (AddDomain/Start/Stop/Reset) from the
/// command queue and publishing the worker's state to the monitor.
/// </summary>
public override void Run()
{
    Trace.TraceInformation("WorkerRole::Run(): started");

    // Endpoint names come from app configuration.
    webCrawler = new Crawler(
        Azure.GetInstance().getQueueReference(ConfigurationManager.AppSettings["urlQ"]),
        Azure.GetInstance().getTableReference(ConfigurationManager.AppSettings["pageInfo"]),
        Azure.GetInstance().getTableReference(ConfigurationManager.AppSettings["errorInfo"]));

    updateState(WorkerInfo.State.Stopped);

    // Main dispatch loop: poll the command queue and drive the crawler accordingly.
    while (true)
    {
        CMsg commandMsg = CommandQueue.GetInstance().getMessage();
        if (commandMsg != null)
        {
            switch (commandMsg.command)
            {
                case Command.AddDomain:
                    webCrawler.addAuthority(new Urls(commandMsg.str).getAuthority());
                    break;
                case Command.Start:
                    updateState(WorkerInfo.State.Loading, true);
                    webCrawler.start();
                    break;
                case Command.Stop:
                    updateState(WorkerInfo.State.Stopping, true);
                    webCrawler.clearAuthorities();
                    webCrawler.stop(true);
                    updateState(WorkerInfo.State.Stopped, true);
                    break;
                case Command.Reset:
                    updateState(WorkerInfo.State.Stopping, true);
                    webCrawler.stop(true);
                    updateState(WorkerInfo.State.Resetting, true);
                    webCrawler.resetDatabase();
                    updateState(WorkerInfo.State.Reset, true);
                    break;
            }
            CommandQueue.GetInstance().removeMessage(commandMsg);
        }
        else
        {
            // BUG FIX: the original loop spun without pausing when the queue was
            // empty, busy-waiting and issuing a billable storage poll per iteration.
            // Sleep briefly while idle, matching the cadence of the sibling roles.
            Thread.Sleep(50);
        }

        if (!webCrawler.stopped())
        {
            updateState(WorkerInfo.State.Crawling);
        }
        WorkerMonitor.GetInstance().updateProfile(currentState);
    }
}
/// <summary>
/// Worker-role entry point: binds the storage table/queues this role depends on,
/// sets up CPU/RAM performance counters, then loops forever draining the admin
/// queue and feeding URL messages to the crawler.
/// </summary>
public override void Run()
{
    // Bind to the storage account and ensure the table/queues exist
    // before entering the work loop.
    CloudStorageAccount storageAccount =
        CloudStorageAccount.Parse(ConfigurationManager.AppSettings["StorageConnectionString"]);

    CloudTableClient tableClient = storageAccount.CreateCloudTableClient();
    datatable = tableClient.GetTableReference("datatable");
    datatable.CreateIfNotExists();

    CloudQueueClient queueClient = storageAccount.CreateCloudQueueClient();
    urlQueue = queueClient.GetQueueReference("urlqueue");
    urlQueue.CreateIfNotExists();
    adminQueue = queueClient.GetQueueReference("adminqueue");
    adminQueue.CreateIfNotExists();

    // Performance counters for status reporting (read elsewhere in the role).
    ramCounter = new System.Diagnostics.PerformanceCounter("Memory", "Available MBytes");
    cpuCounter = new System.Diagnostics.PerformanceCounter();
    cpuCounter.CategoryName = "Processor";
    cpuCounter.CounterName = "% Processor Time";
    cpuCounter.InstanceName = "_Total";

    crawler = new Crawler();

    while (true)
    {
        Thread.Sleep(50);

        // BUG FIX: the original peeked and then called GetMessage(), assuming the
        // peeked message was still available. With multiple role instances another
        // worker can dequeue it in between, making GetMessage() return null and
        // crashing handleAdminMessage/DeleteMessage. Dequeue once and null-check.
        CloudQueueMessage adminMessage = adminQueue.GetMessage();
        if (adminMessage != null)
        {
            crawler.handleAdminMessage(adminMessage);
            adminQueue.DeleteMessage(adminMessage);
        }

        // Same race existed for the URL queue: crawlingPhase could receive null.
        CloudQueueMessage urlMessage = urlQueue.GetMessage();
        if (urlMessage != null)
        {
            crawler.crawlingPhase(urlMessage);
        }
    }
}
/// <summary>
/// Worker-role entry point: runs one-time setup, then loops forever reacting to
/// admin state changes ("started"/"clearing") and, while started, dequeues URLs,
/// crawls them, stores the resulting entities, and publishes running counters.
/// </summary>
public override void Run()
{
    // One-time setup (visited-URL set, storage bindings, etc.).
    startingCode();

    while (true)
    {
        Trace.TraceInformation("Working");
        Thread.Sleep(50);

        // Check and handle admin (start/stop/clear) messages.
        CloudQueueMessage startStopMessage = startStopQueue.GetMessage();
        if (startStopMessage != null)
        {
            state = startStopMessage.AsString;
            Update("info", "state", state);
            startStopQueue.DeleteMessage(startStopMessage);
            if (state.Equals("started"))
            {
                crawl = new Crawler();
                crawl.StartLoader();
            }
        }

        if (state.Equals("clearing"))
        {
            // Drop all pending work and re-run setup.
            queue.Clear();
            startingCode();
        }
        else if (state.Equals("started"))
        {
            CloudQueueMessage message = queue.GetMessage();
            if (message != null)
            {
                urlsCrawled++;
                Update("info", "total", urlsCrawled.ToString());
                queue.FetchAttributes();
                queueSize = (int)queue.ApproximateMessageCount;
                Update("info", "queue", queueSize.ToString());

                string url = message.AsString;
                if (!acceptedURLs.Contains(url))
                {
                    try
                    {
                        List<WebCrawlerEntity> entities = crawl.startCrawler(url);
                        if (entities != null)
                        {
                            numTitles++;
                            Update("info", "numTitles", numTitles.ToString());
                            string temp = "";
                            foreach (WebCrawlerEntity w in entities)
                            {
                                temp += " " + w.PartitionKey;
                                urlsAccepted++;
                                w.num = urlsAccepted;
                                Update("info", "accepted", urlsAccepted.ToString());
                                TableOperation insertOperation = TableOperation.InsertOrReplace(w);
                                // BUG FIX: was table.ExecuteAsync(...) with the task
                                // discarded -- insert failures were silently lost and
                                // raced with message deletion below. Execute
                                // synchronously, matching the error-table write in
                                // the catch block.
                                table.Execute(insertOperation);
                            }
                            // BUG FIX: was inside the foreach, re-adding the same url
                            // once per entity; mark it accepted exactly once.
                            acceptedURLs.Add(url);

                            TextInfo myTI = new CultureInfo("en-US", false).TextInfo;
                            Update("info", "lastTitle", myTI.ToTitleCase(temp));
                        }
                    }
                    catch (Exception e)
                    {
                        // Record the failure in the error table.
                        numErrors++;
                        infoEntity newError = new infoEntity(
                            numErrors.ToString(),
                            "url: " + url + " Error: " + e.Message);
                        TableOperation insertErrorOperation = TableOperation.InsertOrReplace(newError);
                        errorTable.Execute(insertErrorOperation);
                    }
                }
                queue.DeleteMessage(message);
            }
        }
    }
}
/// <summary>
/// Worker-role entry point: binds storage, waits for a "Start" status message,
/// seeds the URL queue from the robots.txt crawls, then loops forever crawling
/// queued URLs, storing each page entity, and publishing index/URL counters.
/// </summary>
public override void Run()
{
    Trace.TraceInformation("WorkerRole1 is running");

    // Storage bindings: result table plus the work/status/visited queues.
    CloudStorageAccount storageAccount =
        CloudStorageAccount.Parse(ConfigurationManager.AppSettings["StorageConnectionString"]);
    CloudTableClient tableClient = storageAccount.CreateCloudTableClient();
    CloudQueueClient queueClient = storageAccount.CreateCloudQueueClient();
    table = tableClient.GetTableReference("htmlURLs");
    HTMLs = queueClient.GetQueueReference("urls");
    messages = queueClient.GetQueueReference("status");
    visitedURLs = queueClient.GetQueueReference("visited");

    bool robotsparsed = false;
    CloudQueueMessage status = null;
    lastTen = new List<String>();

    while (true)
    {
        Thread.Sleep(10);

        if (messages.Exists())
        {
            status = messages.PeekMessage();
        }

        if (status != null && status.AsString.Equals("Start") && HTMLs != null && !robotsparsed)
        {
            // Seed the URL queue from the robots.txt crawls, then replace the
            // "Start" status message with "Crawling".
            spider = new Crawler();
            List<string> urls = spider.crawlRobots();
            List<string> urls2 = spider.cnnRobotsCrawl();
            index = 0;
            urlsCrawled = 0;
            foreach (string url in urls)
            {
                HTMLs.AddMessage(new CloudQueueMessage(url));
            }
            foreach (string url in urls2)
            {
                HTMLs.AddMessage(new CloudQueueMessage(url));
            }
            robotsparsed = true;

            // BUG FIX: the original passed GetMessage()'s result straight to
            // DeleteMessage; GetMessage can return null (message already taken by
            // another instance), which would throw. Null-check before deleting.
            CloudQueueMessage startMessage = messages.GetMessage(TimeSpan.FromMinutes(5));
            if (startMessage != null)
            {
                messages.DeleteMessage(startMessage);
            }
            messages.AddMessage(new CloudQueueMessage("Crawling"));
        }
        else if (status != null && !status.AsString.Equals("Stop") && HTMLs != null && robotsparsed)
        {
            CloudQueueMessage message = HTMLs.GetMessage(TimeSpan.FromMinutes(5));
            if (message != null)
            {
                HTMLs.DeleteMessage(message);
                string messageString = message.AsString;
                if (spider.isAllowed(messageString))
                {
                    // newLinks[0] is the page title (or "Error"); discovered links
                    // follow it -- presumably from index 1 onward, TODO confirm
                    // against Crawler.crawlLink.
                    List<string> newLinks = spider.crawlLink(messageString);
                    URL entry;
                    if (newLinks[0] == "Error")
                    {
                        entry = new URL(messageString, newLinks[0], "Error");
                    }
                    else
                    {
                        entry = new URL(messageString, newLinks[0], "Partition");
                    }

                    // Keep a rolling window of the ten most recently crawled URLs.
                    if (lastTen.Count < 10)
                    {
                        lastTen.Add(messageString);
                    }
                    else
                    {
                        lastTen.Remove(lastTen[0]);
                        lastTen.Add(messageString);
                    }

                    if (spider.visited().Contains(messageString))
                    {
                        index++;
                    }
                    urlsCrawled++;

                    // BUG FIX: was table.ExecuteAsync(...) with the returned task
                    // discarded, so insert failures were silently lost. Execute
                    // synchronously like the counter writes below.
                    TableOperation insertOperation = TableOperation.InsertOrReplace(entry);
                    table.Execute(insertOperation);

                    // BUG FIX: the original loop ran `for (i = 2; i < Count - 1)`,
                    // skipping both newLinks[1] and the final link even though the
                    // `Count > 1` guard implies links start at index 1. Enqueue every
                    // discovered link (synchronously -- AddMessageAsync's task was
                    // also being discarded).
                    if (newLinks.Count > 1)
                    {
                        for (int i = 1; i < newLinks.Count; i++)
                        {
                            HTMLs.AddMessage(new CloudQueueMessage(newLinks[i]));
                        }
                    }
                }

                // Publish running counters to the table for the front end.
                URL tableEntry = new URL("Index", index.ToString(), "IndexCount");
                TableOperation indexOperation = TableOperation.InsertOrReplace(tableEntry);
                table.Execute(indexOperation);
                URL tableEntry2 = new URL("urls", urlsCrawled.ToString(), "URLs");
                TableOperation urlCountOperation = TableOperation.InsertOrReplace(tableEntry2);
                table.Execute(urlCountOperation);
            }
        }
    }
}