private async Task RunAsync(CancellationToken cancellationToken)
{
    _storageManager = new CrawlrStorageManager(ConfigurationManager.AppSettings["StorageConnectionString"]);

    // Get the current cmd from the cmd table;
    // re-execute the cmd query periodically until the current cmd exists.
    while (_storageManager.GetCurrentCmd() != "START")
    {
        Thread.Sleep(5000);
    }

    // If the start cmd was given, initialize the download of robots.txt
    // and populate the xmlQueue and _disallowed list.
    if (_storageManager.GetCurrentCmd() == "START" && _crawlrData == null)
    {
        // Set up queues, tables, data helper, status helper.
        InitializeCrawlrComponents();
        Startup();
    }

    // Recurring work.
    while (!cancellationToken.IsCancellationRequested)
    {
        Trace.TraceInformation("Working");

        // Do work if the current cmd is still "START".
        if (_storageManager.GetCurrentCmd() == "START")
        {
            // Process all XMLs (sitemaps) found.
            string nextXml = "";
            try
            {
                while (_crawlrData.NumXmlsQueued > 0 && _storageManager.GetCurrentCmd() == "START")
                {
                    //CloudQueueMessage nextXmlMsg = _storageManager.XmlQueue.GetMessage();
                    nextXml = _crawlrData.XmlQueue.Dequeue();
                    _crawlrData.NumXmlsQueued--;
                    XmlCrawlr.CrawlXml(ref _crawlrData, ref _storageManager, nextXml);
                    //_storageManager.XmlQueue.DeleteMessage(nextXmlMsg);

                    // Update worker role status.
                    _statusManager.UpdateCrawlrStatus("Loading", _crawlrData, _storageManager);
                    _statusManager.UpdateQueueSize(_storageManager, _crawlrData.NumXmlsQueued, _crawlrData.NumUrlsQueued);
                    Thread.Sleep(50);
                }
            }
            catch (Exception ex)
            {
                try
                {
                    ErrorEntity errorUrl = new ErrorEntity(nextXml, ex.ToString());
                    TableOperation insertErrorUrl = TableOperation.InsertOrReplace(errorUrl);
                    _storageManager.ErrorTable.Execute(insertErrorUrl);
                }
                catch (Exception) { }
            }

            // Process all URLs in the queue.
            string nextUrl = "";
            try
            {
                while (_storageManager.GetCurrentCmd() == "START")
                {
                    CloudQueueMessage nextUrlMsg = _storageManager.UrlQueue.GetMessage();
                    nextUrl = nextUrlMsg.AsString;
                    UrlCrawlr.CrawlUrl(ref _crawlrData, ref _storageManager, nextUrl);
                    _storageManager.UrlQueue.DeleteMessage(nextUrlMsg);
                    _crawlrData.NumUrlsQueued--;

                    // Update worker role status.
                    _statusManager.UpdateCrawlrStatus("Crawling", _crawlrData, _storageManager);
                    _statusManager.UpdateQueueSize(_storageManager, _crawlrData.NumXmlsQueued, _crawlrData.NumUrlsQueued);
                    Thread.Sleep(50);
                }
            }
            catch (Exception ex)
            {
                try
                {
                    ErrorEntity errorUrl = new ErrorEntity(nextUrl, ex.ToString());
                    TableOperation insertErrorUrl = TableOperation.InsertOrReplace(errorUrl);
                    _storageManager.ErrorTable.Execute(insertErrorUrl);
                }
                catch (Exception) { }
            }
        }
        else if (_storageManager.GetCurrentCmd() == "CLEAR")
        {
            // If the "CLEAR" command is found, update status and clear the queues.
            _statusManager.UpdateCrawlrStatus("CLEAR", _crawlrData, _storageManager);
            _statusManager.UpdateQueueSize(_storageManager, 0, 0);
            _storageManager.UrlQueue.Clear();
            _storageManager.XmlQueue.Clear();

            // Give Azure time to delete tables.
            Thread.Sleep(20000);
            try
            {
                // Idle while waiting for the next command.
                while (_storageManager.GetCurrentCmd() == "CLEAR")
                {
                    Thread.Sleep(10000);
                }
            }
            finally
            {
                // Reinitialize the worker role.
                InitializeCrawlrComponents();
                Startup();
            }
        }
        else
        {
            // Idle worker role (for unimplemented 'pause' functionality).
            _statusManager.UpdateCrawlrStatus("Idle", _crawlrData, _storageManager);
            Thread.Sleep(5000);
        }
    }

    Thread.Sleep(1000);
}
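// The catch blocks above persist failures through an ErrorEntity passed to
// TableOperation.InsertOrReplace. The project's ErrorEntity class is not shown
// in this listing; the sketch below is a hypothetical minimal version, assuming
// the classic Azure Storage SDK (WindowsAzure.Storage) whose TableOperation and
// CloudTable calls appear above. The key and property choices are assumptions,
// not the project's actual definition.

using System;
using Microsoft.WindowsAzure.Storage.Table;

public class ErrorEntity : TableEntity
{
    // Parameterless constructor required by the table client for deserialization.
    public ErrorEntity() { }

    public ErrorEntity(string url, string error)
    {
        PartitionKey = "error";
        // Table keys may not contain '/', '\', '#' or '?', so the URL is
        // escaped before being used as the row key (hypothetical choice).
        RowKey = Uri.EscapeDataString(url ?? "unknown");
        Url = url;
        Error = error;
    }

    public string Url { get; set; }
    public string Error { get; set; }
}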
private async Task RunAsync(CancellationToken cancellationToken)
{
    // Set up queues, tables, data helper, status helper.
    InitializeCrawlrComponents();

    // Get the current cmd from the cmd table;
    // re-execute the cmd query periodically until the current cmd exists.
    while (_storageManager.GetCurrentCmd() != "START")
    {
        Thread.Sleep(5000);
    }

    // If the start cmd was given, initialize the download of robots.txt
    // and populate the xmlQueue and _disallowed list.
    if (_storageManager.GetCurrentCmd() == "START")
    {
        Startup();
    }

    // Recurring work.
    while (!cancellationToken.IsCancellationRequested)
    {
        Trace.TraceInformation("Working");

        // Do work if the current cmd is still "START".
        if (_storageManager.GetCurrentCmd() == "START")
        {
            // Process all XMLs (sitemaps) found.
            while (_crawlrData.NumXmlsQueued > 0 && _storageManager.GetCurrentCmd() == "START")
            {
                CloudQueueMessage nextXmlMsg = _storageManager.XmlQueue.GetMessage();
                string nextXml = nextXmlMsg.AsString;
                XmlCrawlr.CrawlXml(ref _crawlrData, ref _storageManager, nextXml);
                _storageManager.XmlQueue.DeleteMessage(nextXmlMsg);
                _crawlrData.NumXmlsQueued--;

                // Update worker role status.
                _statusManager.UpdateCrawlrStatus("Loading", _crawlrData, _storageManager);
                _statusManager.UpdateQueueSize(_storageManager, _crawlrData.NumXmlsQueued, _crawlrData.NumUrlsQueued);
                Thread.Sleep(50);
            }

            // Process all URLs in the queue.
            while (_crawlrData.NumUrlsQueued > 0 && _storageManager.GetCurrentCmd() == "START")
            {
                CloudQueueMessage nextUrlMsg = _storageManager.UrlQueue.GetMessage();
                string nextUrl = nextUrlMsg.AsString;
                UrlCrawlr.CrawlUrl(ref _crawlrData, ref _storageManager, nextUrl);
                _storageManager.UrlQueue.DeleteMessage(nextUrlMsg);
                _crawlrData.NumUrlsQueued--;

                // Update worker role status.
                _statusManager.UpdateCrawlrStatus("Crawling", _crawlrData, _storageManager);
                _statusManager.UpdateQueueSize(_storageManager, _crawlrData.NumXmlsQueued, _crawlrData.NumUrlsQueued);
                Thread.Sleep(50);
            }
        }
        else if (_storageManager.GetCurrentCmd() == "CLEAR")
        {
            // If the "CLEAR" command is found, clear all queues and tables.
            _storageManager.ClearAll();
            _statusManager.UpdateCrawlrStatus("CLEAR", _crawlrData, _storageManager);

            // Give Azure time to delete tables.
            Thread.Sleep(20000);
            try
            {
                // Idle while waiting for the next command.
                while (_storageManager.GetCurrentCmd() == "CLEAR")
                {
                    Thread.Sleep(10000);
                }
            }
            finally
            {
                // Reinitialize the worker role.
                InitializeCrawlrComponents();
                Startup();
            }
        }
        else
        {
            // Idle worker role (for unimplemented 'pause' functionality).
            _statusManager.UpdateCrawlrStatus("Idle", _crawlrData, _storageManager);
            Thread.Sleep(5000);
        }
    }

    Thread.Sleep(1000);
}
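// Both versions declare RunAsync as async but never await, so each Thread.Sleep
// blocks the worker thread and none of the waits observe the CancellationToken.
// The sketch below is a hedged alternative for the command-polling waits only,
// assuming nothing beyond Task.Delay and the existing GetCurrentCmd() accessor:
// it yields between checks and ends promptly when the role is stopping. The
// helper name and parameters are illustrative, not part of the original role.

using System;
using System.Threading;
using System.Threading.Tasks;

internal static class CommandPolling
{
    // Polls the command source until it returns the expected command,
    // awaiting between checks and honoring cancellation.
    public static async Task WaitForCommandAsync(
        Func<string> getCurrentCmd,       // e.g. () => _storageManager.GetCurrentCmd()
        string expected,                  // e.g. "START"
        TimeSpan pollInterval,
        CancellationToken cancellationToken)
    {
        while (getCurrentCmd() != expected)
        {
            // Throws OperationCanceledException when cancellation is requested,
            // so the role stops waiting instead of sleeping through a shutdown.
            await Task.Delay(pollInterval, cancellationToken);
        }
    }
}

// Example use inside RunAsync:
//   await CommandPolling.WaitForCommandAsync(
//       () => _storageManager.GetCurrentCmd(), "START",
//       TimeSpan.FromSeconds(5), cancellationToken);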