/// <summary>
/// Downloads a robots.txt file and processes it line by line.
/// Lines containing "Sitemap" that also mention "cnn.com" or "/nba" have
/// their URL added to the link queue; lines containing "Disallow" have their
/// path appended to the site root and recorded in <c>disallowedUrl</c>.
/// </summary>
/// <param name="message">URL of the robots.txt file to download.</param>
private void HandleRobotstxt(string message)
{
    using (WebClient client = new WebClient())
    // Dispose the reader (and the response stream it wraps) even when an
    // exception is thrown part-way through the file.
    using (StreamReader reader = new StreamReader(client.OpenRead(message)))
    {
        // Site root: the robots.txt URL with the file name stripped off.
        Uri mainUri = new Uri(message.Replace("/robots.txt", ""));

        while (!reader.EndOfStream)
        {
            String line = reader.ReadLine();

            if (line.Contains("Sitemap"))
            {
                // Only sitemaps that mention cnn.com or /nba are queued.
                if (line.Contains("cnn.com") || line.Contains("/nba"))
                {
                    line = line.Replace("Sitemap: ", "");
                    // NOTE(review): the returned task is not awaited, so a
                    // failed enqueue goes unobserved here.
                    StorageManager.LinkQueue().AddMessageAsync(new CloudQueueMessage(line));
                }
            }
            // if the line contains "Disallow"
            else if (line.Contains("Disallow"))
            {
                line = line.Replace("Disallow: ", "");
                string disallowedLink = mainUri.OriginalString + line;
                disallowedUrl.Add(disallowedLink);
            }
        }
    }
}
/// <summary>
/// Loads a sitemap index document and adds to the link queue the location of
/// every child <c>sitemap</c> entry whose <c>lastmod</c> is later than
/// 1 December 2017.
/// </summary>
/// <param name="link">URI of the sitemap index XML document to load.</param>
private void CrawlSiteMapIndex(string link)
{
    // XML namespace defined by the sitemap protocol.
    XNamespace sitemapNamespace = "http://www.sitemaps.org/schemas/sitemap/0.9";
    XName sitemapName = sitemapNamespace + "sitemap";
    XName locName = sitemapNamespace + "loc";
    XName lastModName = sitemapNamespace + "lastmod";

    // Entries last modified on or before this date are ignored.
    DateTime cutoffDate = new DateTime(2017, 12, 1);

    XElement sitemapIndex = XElement.Load(link);

    foreach (XElement sitemapElement in sitemapIndex.Elements(sitemapName))
    {
        XElement locElement = sitemapElement.Element(locName);
        XElement lastModElement = sitemapElement.Element(lastModName);

        // <lastmod> is optional in the sitemap protocol; an entry without it
        // (or without <loc>) cannot be checked against the cutoff, so skip it
        // rather than fail the whole index with a NullReferenceException.
        if (locElement == null || lastModElement == null)
        {
            continue;
        }

        // Parse culture-independently and tolerate malformed dates.
        DateTime publishDate;
        bool parsed = DateTime.TryParse(
            lastModElement.Value,
            System.Globalization.CultureInfo.InvariantCulture,
            System.Globalization.DateTimeStyles.None,
            out publishDate);
        if (!parsed)
        {
            continue;
        }

        if (publishDate > cutoffDate)
        {
            StorageManager.LinkQueue().AddMessage(new CloudQueueMessage(locElement.Value));
        }
    }
}
/// <summary>
/// Takes one message from the link queue and processes it: a robots.txt URL
/// is handed to <c>HandleRobotstxt</c>, a link containing "-index" to
/// <c>CrawlSiteMapIndex</c>, and any other link to <c>CrawlSiteMap</c>.
/// The message is deleted after processing. After a sitemap link, if no
/// message is left in the queue, <c>state</c> is set to "Crawling".
/// Does nothing beyond starting the performance-counter task when the queue
/// yields no message.
/// </summary>
public void CrawlUrl()
{
    new Task(GetPerfCounters).Start();
    // NOTE(review): presumably gives the counter task a moment to run - confirm.
    Thread.Sleep(100);

    CloudQueueMessage linkMessage = StorageManager.LinkQueue().GetMessage();
    if (linkMessage == null)
    {
        return;
    }

    string stringifiedLink = linkMessage.AsString;

    // if the message is robots.txt
    if (stringifiedLink.EndsWith("robots.txt"))
    {
        HandleRobotstxt(stringifiedLink);
        StorageManager.LinkQueue().DeleteMessage(linkMessage);
        return;
    }

    // otherwise the message is a sitemap xml link
    if (stringifiedLink.Contains("-index"))
    {
        // the link contains more xml links
        CrawlSiteMapIndex(stringifiedLink);
    }
    else
    {
        // the link contains html links
        CrawlSiteMap(stringifiedLink);
    }

    StorageManager.LinkQueue().DeleteMessage(linkMessage);

    // Peek instead of GetMessage: GetMessage would dequeue the next message
    // (hiding it for the visibility timeout and incrementing its dequeue
    // count) just to test whether the queue is empty.
    if (StorageManager.LinkQueue().PeekMessage() == null)
    {
        state = "Crawling";
    }
}