// Worker entry point: polls the command queue for "start"/"stop" messages,
// publishes an "idle" dashboard row between commands, and kicks off a crawl
// on "start". Exits the poll loop on "stop", then hands off to RunAsync.
public override void Run()
{
    Trace.TraceInformation("WorkerRole1 is running");
    try
    {
        getReference g = new getReference();
        CloudQueue cmd = g.commandQueue();
        CloudTable table = g.getTable();
        while (true)
        {
            CloudQueueMessage retrievedMessage = cmd.GetMessage();

            // Publish "idle" status each pass so the dashboard reflects the
            // worker's state between crawls (crawlerUrls overwrites it with
            // "crawling" while a crawl is in progress).
            crawledTable dashboard = new crawledTable("dash", null, null, null, null, null, "rowkey", 0, null, null, "idle");
            TableOperation insertOrReplaceOperation1 = TableOperation.InsertOrReplace(dashboard);
            table.Execute(insertOrReplaceOperation1);

            if (retrievedMessage != null && retrievedMessage.AsString.Equals("start"))
            {
                cmd.DeleteMessage(retrievedMessage);
                crawlRobots();
            }
            else if (retrievedMessage != null && retrievedMessage.AsString.Equals("stop"))
            {
                cmd.DeleteMessage(retrievedMessage);
                break;
            }
            else
            {
                // BUG FIX: the original slept only AFTER the infinite loop, so an
                // empty queue was hammered with GetMessage calls as fast as the
                // network allowed (each one is a billable storage transaction).
                // Back off briefly when no actionable command arrived.
                Thread.Sleep(50);
            }
        }

        // NOTE(review): blocking on an async method with .Wait() is a known
        // deadlock/starvation hazard, but Run() is a synchronous override in the
        // WorkerRole contract, so the blocking wait is retained here.
        this.RunAsync(this.cancellationTokenSource.Token).Wait();
    }
    finally
    {
        this.runCompleteEvent.Set();
    }
}
// Drops the crawler's storage table entirely (index, dashboard and error rows).
public void deleteTable()
{
    CloudTable crawlTable = new getReference().getTable();
    crawlTable.Delete();
}
// Returns the crawler's current status string ("idle", "loading", "crawling", ...)
// from the dashboard entity, or null when no dashboard row exists yet.
public String getStatus()
{
    getReference g = new getReference();
    CloudTable table = g.getTable();
    TableOperation retrieveOperation = TableOperation.Retrieve<crawledTable>("dash", "rowkey");
    // Execute the retrieve operation.
    TableResult retrievedResult = table.Execute(retrieveOperation);
    // BUG FIX: Result is null when the ("dash","rowkey") entity does not exist
    // (e.g. before the first crawl or after stopCrawl deletes the table); the
    // original cast then threw NullReferenceException.
    crawledTable entity = (crawledTable)retrievedResult.Result;
    if (entity == null)
    {
        return null;
    }
    return entity.status;
}
// Returns the crawled-page count recorded on the dashboard entity,
// or 0 when no dashboard row exists yet.
public int tableSize()
{
    getReference g = new getReference();
    CloudTable table = g.getTable();
    TableOperation retrieveOperation = TableOperation.Retrieve<crawledTable>("dash", "rowkey");
    // Execute the retrieve operation.
    TableResult retrievedResult = table.Execute(retrieveOperation);
    // BUG FIX: Result is null when the dashboard entity is missing; the
    // original cast then threw NullReferenceException. Report 0 instead.
    crawledTable entity = (crawledTable)retrievedResult.Result;
    if (entity == null)
    {
        return 0;
    }
    return entity.tableSize;
}
// Looks up a page title in the "index" partition by the URL-encoded search
// term (used as the row key). Returns null when no entity matches.
public String search(String term)
{
    getReference g = new getReference();
    CloudTable table = g.getTable();
    String searchterm = EncodeUrlInKey(term);
    TableOperation retrieveOperation = TableOperation.Retrieve<crawledTable>("index", searchterm);
    // Execute the retrieve operation.
    TableResult retrievedResult = table.Execute(retrieveOperation);
    // BUG FIX: a miss leaves Result null; the original cast then threw
    // NullReferenceException on any term that was never crawled.
    // (Also removed the unused local List<String> t.)
    crawledTable entity = (crawledTable)retrievedResult.Result;
    if (entity == null)
    {
        return null;
    }
    return entity.title;
}
// Signals the worker to stop: enqueues a "stop" command, then clears the URL
// queue and deletes the crawl table so the next crawl starts from scratch.
public void stopCrawl()
{
    getReference refs = new getReference();

    // Post the stop command for the worker's poll loop to pick up.
    CloudQueue commandQueue = refs.commandQueue();
    commandQueue.CreateIfNotExists();
    commandQueue.AddMessage(new CloudQueueMessage("stop"));

    // Discard any URLs still waiting to be crawled.
    refs.getQueue().Clear();

    // Remove all crawled data (index, dashboard and error entities).
    refs.getTable().Delete();
}
// Returns the list of crawl error messages recorded on the dashboard entity.
// The errors are stored as one comma-joined string; an empty list is returned
// when there is no dashboard row or no errors.
public List<String> errors()
{
    getReference g = new getReference();
    CloudTable table = g.getTable();
    List<String> t = new List<String>();
    TableOperation retrieveOperation = TableOperation.Retrieve<crawledTable>("dash", "rowkey");
    // Execute the retrieve operation.
    TableResult retrievedResult = table.Execute(retrieveOperation);
    // Guard against a missing dashboard entity (Result is null before the
    // first crawl), which made the original cast throw NullReferenceException.
    crawledTable entity = (crawledTable)retrievedResult.Result;
    if (entity == null)
    {
        return t;
    }
    String value = entity.error;
    // BUG FIX: the original also required value.Contains(','), so a dashboard
    // holding exactly ONE error message (no comma) returned an empty list and
    // the error was silently hidden. Split unconditionally: Split on a string
    // with no separator yields the whole string as a single element.
    if (!String.IsNullOrEmpty(value))
    {
        t.AddRange(value.Split(','));
    }
    return t;
}
// Walks a list of sitemap URLs, expanding nested sitemap-index entries in
// place, and enqueues every non-sitemap (HTML page) URL found onto the crawl
// queue. Returns the set of page URLs discovered.
//
// Note: oldList aliases the caller's list and grows while being walked by
// index — that is intentional: newly found nested sitemaps are appended and
// processed on later iterations.
private HashSet<String> getAllHtml(List<String> o)
{
    List<String> oldList = o;
    HashSet<String> htmlList = new HashSet<String>();
    getReference g = new getReference();
    CloudQueue queue = g.getQueue();
    CloudTable table = g.getTable();
    for (int count = 0; count < oldList.Count; count++)
    {
        String html;
        // BUG FIX: WebClient is IDisposable but was never disposed, leaking
        // an underlying connection/handle per sitemap fetched.
        using (WebClient web = new WebClient())
        {
            html = web.DownloadString(oldList[count]);
        }
        // Extract every <loc>...</loc> entry from the sitemap XML.
        MatchCollection m1 = Regex.Matches(html, @"<loc>\s*(.+?)\s*</loc>", RegexOptions.Singleline);
        foreach (Match m in m1)
        {
            String url = m.Groups[1].Value;
            // Nested sitemap: only follow current ("2015") or undated ones.
            if (url.Contains("xml") && ((url.Contains("2015") || !url.Contains("-20"))))
            {
                oldList.Add(url);
            }
            if (!url.Contains("xml"))
            {
                // Publish "loading" status while URLs are still being discovered.
                crawledTable dashboard = new crawledTable("dash", null, null, null, null, null, "rowkey", 0, null, null, "loading");
                TableOperation insertOrReplaceOperation1 = TableOperation.InsertOrReplace(dashboard);
                table.Execute(insertOrReplaceOperation1);
                CloudQueueMessage message = new CloudQueueMessage(url);
                // BUG FIX: AddMessageAsync was called fire-and-forget (the Task
                // was discarded, so failures were unobservable and messages
                // could be dropped). Use the synchronous AddMessage, consistent
                // with the rest of the file.
                queue.AddMessage(message);
                htmlList.Add(url);
            }
        }
    }
    return htmlList;
}
// Main crawl loop: drains the URL queue, fetching each page, recording its
// title/date in the "index" partition and live stats (last-ten URLs, memory,
// CPU, table size, errors) in the dashboard entity, then enqueues any new
// same-site links that pass the robots.txt disallow list.
//
// duplicateList — URLs already seen (grown in place to dedupe new links).
// noRobots      — disallowed URL fragments from robots.txt.
public void crawlerUrls(HashSet<String> duplicateList, List<String> noRobots)
{
    List<String> lastten = new List<String>();
    List<String> errorList = new List<String>();
    getReference g = new getReference();
    CloudQueue queue = g.getQueue();
    CloudTable table = g.getTable();
    CloudQueue cmd = g.commandQueue();
    queue.FetchAttributes();
    var limitCount = queue.ApproximateMessageCount.Value;
    int tableSize = 0;
    while (0 < limitCount)
    {
        CloudQueueMessage retrievedMessage = queue.GetMessage();
        try
        {
            if (retrievedMessage != null)
            {
                HtmlWeb web = new HtmlWeb();
                HtmlDocument document = web.Load(retrievedMessage.AsString);

                // Page title (empty string when the page has no <title>).
                String title = "";
                HtmlNode node = document.DocumentNode.SelectSingleNode("//title");
                if (node != null)
                {
                    title = node.InnerHtml;
                }

                // Publication date from <meta itemprop="dateCreated">, if present.
                HtmlNode dateNode = document.DocumentNode.SelectSingleNode("//meta[(@itemprop='dateCreated')]");
                String date = "";
                if (dateNode != null)
                {
                    date = dateNode.GetAttributeValue("content", "");
                }

                // Rolling window of the ten most recently crawled URLs.
                String tenUrls = "";
                lastten.Add(retrievedMessage.AsString);
                if (lastten.Count == 11)
                {
                    lastten.RemoveAt(0);
                    tenUrls = String.Join(",", lastten);
                }

                queue.DeleteMessage(retrievedMessage);
                String encodeUrl = EncodeUrlInKey(retrievedMessage.AsString);
                float memory = this.theMemCounter.NextValue();
                float cpuUtilization = this.cpuload.NextValue();
                tableSize++;
                String errors = String.Join(",", errorList);
                crawledTable ct = new crawledTable("index", retrievedMessage.AsString, title, date, null, null, encodeUrl, 0, null, null, null);
                crawledTable dashboard = new crawledTable("dash", retrievedMessage.AsString, title, date, errors, tenUrls, "rowkey", tableSize, memory.ToString(), cpuUtilization + "%", "crawling");
                TableOperation insertOrReplaceOperation = TableOperation.InsertOrReplace(ct);
                TableOperation insertOrReplaceOperation1 = TableOperation.InsertOrReplace(dashboard);
                table.Execute(insertOrReplaceOperation);
                table.Execute(insertOrReplaceOperation1);

                // Site root used to absolutize relative links and keep the
                // crawl on-site.
                String root = "";
                if (retrievedMessage.AsString.Contains("bleacher"))
                {
                    root = "bleacherreport.com";
                }
                else if (retrievedMessage.AsString.Contains("cnn"))
                {
                    root = "cnn.com";
                }

                var rows = document.DocumentNode.SelectNodes("//a[@href]");
                if (rows != null && rows.Count > 0)
                {
                    foreach (var link in rows)
                    {
                        String url = link.Attributes["href"].Value;
                        if (url.StartsWith("//"))
                        {
                            url = "http:" + url;   // protocol-relative link
                        }
                        else if (url.StartsWith("/"))
                        {
                            url = "http://" + root + url;   // site-relative link
                        }
                        // Skip anything matching a robots.txt disallow fragment.
                        Boolean isAllowed = true;
                        for (int i = 0; i < noRobots.Count; i++)
                        {
                            if (url.Contains(noRobots[i]))
                            {
                                isAllowed = false;
                                break;
                            }
                        }
                        if (!duplicateList.Contains(url) && isAllowed && (url.Contains(root + "/")))
                        {
                            duplicateList.Add(url);
                            CloudQueueMessage message = new CloudQueueMessage(url);
                            // BUG FIX: AddMessageAsync was fire-and-forget (Task
                            // discarded, failures unobservable); use the
                            // synchronous AddMessage like the rest of the file.
                            queue.AddMessage(message);
                        }
                    }
                }
            }
        }
        catch (Exception e)
        {
            String errorMessage = "Url: " + retrievedMessage.AsString + " problem is: " + e.Message;
            // BUG FIX: the original checked Contains(errorMessage) but added
            // e.Message, so the dedup check compared against a string that was
            // never stored (duplicates slipped through and the URL context was
            // lost). Store the same string that is checked.
            if (!errorList.Contains(errorMessage))
            {
                errorList.Add(errorMessage);
            }
        }
        // Refresh the remaining-message estimate so the loop terminates when
        // the queue drains.
        queue.FetchAttributes();
        limitCount = queue.ApproximateMessageCount.Value;
    }
}
// Crawl loop variant: drains the URL queue, recording each page's title/date
// in the "index" partition and the last-ten-URLs window on the dashboard,
// enqueuing new same-site links not in duplicateList or the robots.txt
// disallow list. Pages that fail to download are recorded in the "error"
// partition instead of the in-memory error list used by the other overload.
//
// duplicateList — URLs already seen (grown in place to dedupe new links).
// noRobots      — disallowed URLs from robots.txt (exact-match here).
public void crawlerUrls(HashSet<String> duplicateList, List<String> noRobots)
{
    HashSet<String> urlList = duplicateList;
    List<String> lastten = new List<String>();
    getReference g = new getReference();
    CloudQueue queue = g.getQueue();
    CloudTable table = g.getTable();
    CloudQueue cmd = g.commandQueue();
    queue.FetchAttributes();
    var limitCount = queue.ApproximateMessageCount.Value;
    while (0 < limitCount)
    {
        CloudQueueMessage retrievedMessage = queue.GetMessage();
        try
        {
            if (retrievedMessage != null)
            {
                HtmlWeb web = new HtmlWeb();
                HtmlDocument document = web.Load(retrievedMessage.AsString);

                // Page title (empty string when the page has no <title>).
                String title = "";
                HtmlNode node = document.DocumentNode.SelectSingleNode("//title");
                if (node != null)
                {
                    title = node.InnerHtml;
                }

                // Publication date from <meta itemprop="dateCreated">, if present.
                HtmlNode dateNode = document.DocumentNode.SelectSingleNode("//meta[(@itemprop='dateCreated')]");
                String date = "";
                if (dateNode != null)
                {
                    date = dateNode.GetAttributeValue("content", "");
                }

                // Rolling window of the ten most recently crawled URLs.
                String tenUrls = "";
                lastten.Add(retrievedMessage.AsString);
                if (lastten.Count == 11)
                {
                    lastten.RemoveAt(0);
                    tenUrls = String.Join(",", lastten);
                }

                queue.DeleteMessage(retrievedMessage);
                String encodeUrl = EncodeUrlInKey(retrievedMessage.AsString);
                // BUG FIX: the parsed title was discarded (null was stored on
                // the index entity), so search() — which reads .title from the
                // "index" partition — always came back empty. Store the title,
                // consistent with the other crawlerUrls overload.
                crawledTable ct = new crawledTable("index", retrievedMessage.AsString, title, date, null, null, encodeUrl, 0, null, null, null);
                crawledTable dashboard = new crawledTable("dash", retrievedMessage.AsString, title, date, "DASHBOARD", tenUrls, "rowkey", 0, null, null, null);
                TableOperation insertOrReplaceOperation = TableOperation.InsertOrReplace(ct);
                TableOperation insertOrReplaceOperation1 = TableOperation.InsertOrReplace(dashboard);
                table.Execute(insertOrReplaceOperation);
                table.Execute(insertOrReplaceOperation1);

                // Site root used to absolutize relative links and keep the
                // crawl on-site.
                String root = "";
                if (retrievedMessage.AsString.Contains("bleacher"))
                {
                    root = "bleacherreport.com";
                }
                else if (retrievedMessage.AsString.Contains("cnn"))
                {
                    root = "cnn.com";
                }

                var rows = document.DocumentNode.SelectNodes("//a[@href]");
                if (rows != null && rows.Count > 0)
                {
                    foreach (var link in rows)
                    {
                        String url = link.Attributes["href"].Value;
                        if (url.StartsWith("//"))
                        {
                            url = "http:" + url;   // protocol-relative link
                        }
                        else if (url.StartsWith("/"))
                        {
                            url = "http://" + root + url;   // site-relative link
                        }
                        if (!urlList.Contains(url) && !noRobots.Contains(url) && (url.Contains(root + "/")))
                        {
                            urlList.Add(url);
                            CloudQueueMessage message = new CloudQueueMessage(url);
                            // BUG FIX: AddMessageAsync was fire-and-forget (Task
                            // discarded, failures unobservable); use the
                            // synchronous AddMessage like the rest of the file.
                            queue.AddMessage(message);
                        }
                    }
                }
            }
        }
        catch (WebException e)
        {
            // Record the failed URL in the "error" partition and drop it.
            queue.DeleteMessage(retrievedMessage);
            crawledTable ct = new crawledTable("error", retrievedMessage.AsString, "No Title", "No date", e.Status.ToString(), null, "error url", 0, null, null, null);
            TableOperation insertOrReplaceOperation = TableOperation.InsertOrReplace(ct);
            table.Execute(insertOrReplaceOperation);
        }
        // BUG FIX: the count refresh was nested inside "if (retrievedMessage
        // != null)", so one null dequeue (e.g. a message hidden by its
        // visibility timeout) left limitCount stale and spun this loop
        // forever. Refresh unconditionally at the end of every iteration,
        // matching the other overload.
        queue.FetchAttributes();
        limitCount = queue.ApproximateMessageCount.Value;
    }
}