/// <summary>
/// Main loop of the crawler worker. Prepares queues, table and counters, waits until the
/// admin command is no longer "stopCrawl", loads the robots.txt rules for the two seed
/// sites, and then polls forever: a "newCrawl" command resets state and re-parses every
/// base sitemap, while "resumeCrawl" processes one XML message and one HTML message per pass.
/// </summary>
/// <remarks>
/// This method never returns. Failures while crawling an HTML URL are reported through
/// <see cref="System.Diagnostics.Trace"/> and the loop keeps running (best-effort crawl).
/// </remarks>
public override void Run()
{
    createQueues();
    createTable();
    initializeCounters();

    // Block here, polling once per second, until the admin command leaves "stopCrawl".
    while (checkAdminStatus() == "stopCrawl")
    {
        System.Threading.Thread.Sleep(1000);
    }

    visitedUrls = new HashSet<string>();
    disallowList = new List<string>();
    baseSitemap = new List<string>();

    readRobots("http://www.cnn.com/robots.txt");
    readRobots("http://bleacherreport.com/robots.txt");

    while (true)
    {
        System.Threading.Thread.Sleep(50);

        if (checkAdminStatus() == "newCrawl")
        {
            // Start from scratch: rebuild storage and counters, then re-parse every base sitemap.
            createQueues();
            createTable();
            initializeCounters();

            foreach (string baseSitemapUrl in baseSitemap)
            {
                parseXML(baseSitemapUrl);
            }

            // Flip the admin command to "resumeCrawl" so the check below starts draining the
            // queues in this same pass.
            TableOperation getAdminNode = TableOperation.Retrieve<adminNode>("admin", "command");
            TableResult retrievedAdminNode = stattable.Execute(getAdminNode);
            adminNode newAdminNode = (adminNode)retrievedAdminNode.Result;
            if (newAdminNode != null)
            {
                newAdminNode.currentCommand = "resumeCrawl";
                TableOperation updateAdminNode = TableOperation.Replace(newAdminNode);
                stattable.Execute(updateAdminNode);
            }
        }

        // The status is deliberately re-read here: the branch above may just have changed it.
        if (checkAdminStatus() == "resumeCrawl")
        {
            CloudQueueMessage currentXmlUrl = xmlqueue.GetMessage();
            if (currentXmlUrl != null)
            {
                parseXML(currentXmlUrl.AsString);
                xmlqueue.DeleteMessage(currentXmlUrl);
            }

            CloudQueueMessage currentHtmlUrl = null;
            try
            {
                currentHtmlUrl = htmlqueue.GetMessage();
                if (currentHtmlUrl != null)
                {
                    crawlUrl(currentHtmlUrl.AsString);
                    htmlqueue.DeleteMessage(currentHtmlUrl);
                }
            }
            catch (Exception e)
            {
                // Keep the crawl best-effort, but do not hide the failure: record which URL
                // failed (if one was dequeued) and why.
                // NOTE(review): a message whose crawl throws is not deleted here; presumably
                // the queue makes it visible again later and it is retried - confirm.
                System.Diagnostics.Trace.TraceError(
                    "Failed to process HTML queue message '{0}': {1}",
                    currentHtmlUrl != null ? currentHtmlUrl.AsString : "(none)",
                    e);
            }
        }

        updatePerformanceCounter();
    }
}
/// <summary>
/// Seeds the statistics table: re-writes the total row counter (keeping any count that is
/// already stored), sets the admin command to "stopCrawl", and resets the performance
/// node to "None"/"None".
/// </summary>
public void initializeCounters()
{
    // Row counter: carry over the stored count when an entry exists, otherwise start at zero.
    TableResult storedCount = stattable.Execute(
        TableOperation.Retrieve<rowCount>("rowCount", "totalRows"));
    int startingRows = storedCount.Result != null
        ? ((rowCount)storedCount.Result).count
        : 0;
    stattable.Execute(TableOperation.InsertOrReplace(new rowCount(startingRows)));

    // Admin command: the crawler starts in the stopped state.
    stattable.Execute(TableOperation.InsertOrReplace(new adminNode("stopCrawl")));

    // Performance counters: placeholder values until real readings are written.
    stattable.Execute(TableOperation.InsertOrReplace(new performanceNode("None", "None")));
}
/// <summary>
/// Sets the stored admin command to "resumeCrawl" when an admin entry exists.
/// </summary>
/// <returns>The fixed message "Resuming crawl...", whether or not an entry was updated.</returns>
public string resumeCrawl()
{
    const string reply = "Resuming crawl...";

    createTable();

    TableResult lookup = stattable.Execute(
        TableOperation.Retrieve<adminNode>("admin", "command"));
    adminNode admin = (adminNode)lookup.Result;

    // Nothing stored under admin/command: there is no entry to update.
    if (admin == null)
    {
        return reply;
    }

    admin.currentCommand = "resumeCrawl";
    stattable.Execute(TableOperation.Replace(admin));
    return reply;
}
/// <summary>
/// Seeds the statistics table: writes a URL row counter of zero, sets the admin command
/// to "stopCrawl", and resets the performance node to "None"/"None".
/// </summary>
public void initializeCounters()
{
    // URL row counter always starts from zero in this variant.
    stattable.Execute(TableOperation.InsertOrReplace(new rowCount(0)));

    // Admin command: the crawler starts in the stopped state.
    stattable.Execute(TableOperation.InsertOrReplace(new adminNode("stopCrawl")));

    // Performance counters: placeholder values until real readings are written.
    stattable.Execute(TableOperation.InsertOrReplace(new performanceNode("None", "None")));
}