/// <summary>
/// Downloads the XML document at <paramref name="URL"/> and queues the links it lists.
/// The document is searched for "sitemap" elements first and, if there are none, for "url" elements.
/// Links ending in "xml" are added to LoadQueue; all other accepted links go to CrawlQueue.
/// </summary>
/// <param name="URL">Address of the XML document to download and parse.</param>
private void ProcessXML(string URL)
{
    HttpWebRequest httpRequest = (HttpWebRequest)WebRequest.Create(URL);
    httpRequest.Timeout = 10000; // 10 secs
    httpRequest.UserAgent = "Code Sample Web Client";

    // Load the whole document while the response is open, then release the
    // connection right away instead of leaving it to the finalizer.
    XmlDocument xmlDoc = new XmlDocument();
    using (HttpWebResponse webResponse = (HttpWebResponse)httpRequest.GetResponse())
    using (var stream = new StreamReader(webResponse.GetResponseStream()))
    {
        xmlDoc.Load(stream);
    }

    // Prefer "sitemap" entries; fall back to "url" entries when none exist.
    XmlNodeList elements = xmlDoc.GetElementsByTagName("sitemap");
    if (elements.Count == 0)
    {
        elements = xmlDoc.GetElementsByTagName("url");
    }

    for (int i = 0; i < elements.Count; i++)
    {
        var entry = elements[i];

        // An empty entry has no first child to read the link from; skip it
        // rather than failing the whole document with a NullReferenceException.
        if (!entry.HasChildNodes)
        {
            continue;
        }

        var link = entry.ChildNodes[0].InnerText;

        // When the last child's text differs from the link, the entry has at least
        // two children and the second one is read as the date to validate.
        bool correctDate = true;
        if (entry.LastChild.InnerText != link)
        {
            var date = entry.ChildNodes[1].InnerText;
            correctDate = CheckLinkIsRecent(link, date);
        }

        if (CheckLinkDomain(link) && CheckLinkIsCorrectType(link) && CheckIfAllowed(link) && correctDate)
        {
            CloudQueueMessage linkMessage = new CloudQueueMessage(link);
            if (link.EndsWith("xml"))
            {
                LoadQueue.AddMessage(linkMessage);
            }
            else
            {
                CrawlQueue.AddMessage(linkMessage);
            }
        }
    }
}
/// <summary>
/// Starts the crawler. If the stop queue has no message to peek, the loading queue is
/// seeded with a single message containing both robots.txt URLs separated by a space.
/// Otherwise one message is removed from the stop queue.
/// </summary>
/// <returns>The fixed string "start crawler method executed".</returns>
public string StartCrawler()
{
    StopQueue = CloudConfiguration.GetStopQueue();
    StateQueue = CloudConfiguration.GetStateQueue();

    if (StopQueue.PeekMessage() == null)
    {
        LoadQueue = CloudConfiguration.GetLoadingQueue();
        CloudQueueMessage startMessage = new CloudQueueMessage("http://www.cnn.com/robots.txt http://www.bleacherreport.com/robots.txt");
        LoadQueue.AddMessage(startMessage);
        // NOTE(review): a "Loading" message used to be constructed here but was never
        // added to any queue. If StateQueue is meant to receive it, enqueue it here.
    }
    else
    {
        // The message seen by PeekMessage may no longer be retrievable by the time
        // GetMessage runs, so guard against a null result before deleting.
        CloudQueueMessage stopMessage = StopQueue.GetMessage();
        if (stopMessage != null)
        {
            StopQueue.DeleteMessage(stopMessage);
        }
    }

    return ("start crawler method executed");
}
/// <summary>
/// Downloads the text file at <paramref name="URL"/>, collects the distinct URLs from its
/// "Sitemap:" lines, and adds each one that passes CheckLinkDomain to LoadQueue.
/// </summary>
/// <param name="URL">Address of the text file (e.g. a robots.txt) to download.</param>
private void ProcessTxt(string URL)
{
    string file;
    using (WebClient client = new WebClient())
    {
        file = client.DownloadString(URL);
    }

    HashSet<string> links = new HashSet<string>();
    foreach (string rawLine in file.Split('\n'))
    {
        // Trim so that a "\r" left over from CRLF line endings does not end up in the URL.
        string line = rawLine.Trim();

        // Ordinal, case-insensitive match: the directive name is not culture-dependent text.
        if (!line.StartsWith("sitemap:", System.StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // Skip directives with no "http" in them instead of letting Substring(-1) throw.
        int urlStart = line.IndexOf("http", System.StringComparison.Ordinal);
        if (urlStart < 0)
        {
            continue;
        }

        links.Add(line.Substring(urlStart));
    }

    foreach (string link in links)
    {
        if (CheckLinkDomain(link))
        {
            CloudQueueMessage message = new CloudQueueMessage(link);
            LoadQueue.AddMessage(message);
        }
    }
}
/// <summary>
/// Starts the crawler: removes every retrievable message from the stop queue, then adds
/// the CNN and Bleacher Report robots.txt URLs to the loading queue as two separate messages.
/// </summary>
/// <returns>
/// The loading queue's name followed by the two queued message strings, separated by spaces.
/// </returns>
public string StartCrawler()
{
    StopQueue = CloudConfiguration.GetStopQueue();

    // Drain the stop queue: keep fetching and deleting until GetMessage returns null.
    for (CloudQueueMessage pending = StopQueue.GetMessage();
         pending != null;
         pending = StopQueue.GetMessage())
    {
        StopQueue.DeleteMessage(pending);
    }

    LoadQueue = CloudConfiguration.GetLoadingQueue();

    // Seed the loading queue, CNN first and Bleacher Report second.
    CloudQueueMessage cnnSeed = new CloudQueueMessage("http://www.cnn.com/robots.txt");
    CloudQueueMessage bleacherSeed = new CloudQueueMessage("http://www.bleacherreport.com/robots.txt");
    LoadQueue.AddMessage(cnnSeed);
    LoadQueue.AddMessage(bleacherSeed);

    return string.Format("{0} {1} {2}", LoadQueue.Name, cnnSeed.AsString, bleacherSeed.AsString);
}