Ejemplo n.º 1
0
 /// <summary>
 /// Downloads a robots.txt file and processes it line by line: qualifying
 /// <c>Sitemap</c> URLs are added to the link queue and <c>Disallow</c> paths
 /// are recorded (prefixed with the site root) in <c>disallowedUrl</c>.
 /// </summary>
 /// <param name="message">Absolute URL of the robots.txt file to download.</param>
 private void HandleRobotstxt(string message)
 {
     using (WebClient client = new WebClient())
     using (StreamReader reader = new StreamReader(client.OpenRead(message)))
     {
         // Site root = robots.txt URL with the file name removed; Disallow paths are appended to it.
         Uri mainUri = new Uri(message.Replace("/robots.txt", ""));

         string line;
         while ((line = reader.ReadLine()) != null)
         {
             string sitemapUrl = ReadRobotsDirectiveValue(line, "Sitemap");
             if (sitemapUrl != null)
             {
                 // Only sitemaps that mention cnn.com or an /nba path are queued.
                 if (sitemapUrl.Length > 0 && (sitemapUrl.Contains("cnn.com") || sitemapUrl.Contains("/nba")))
                 {
                     // Enqueue synchronously so a failure surfaces here instead of
                     // being lost in an unobserved fire-and-forget task.
                     StorageManager.LinkQueue().AddMessage(new CloudQueueMessage(sitemapUrl));
                 }
                 continue;
             }

             string disallowedPath = ReadRobotsDirectiveValue(line, "Disallow");
             // An empty Disallow value disallows nothing, so there is nothing to record.
             if (!string.IsNullOrEmpty(disallowedPath))
             {
                 string disallowedLink = mainUri.OriginalString + disallowedPath;
                 disallowedUrl.Add(disallowedLink);
             }
         }
     }
 }

 /// <summary>
 /// Extracts the value of a <c>name: value</c> robots.txt line.
 /// </summary>
 /// <param name="line">Raw line read from the robots.txt file.</param>
 /// <param name="directive">Directive name without the colon, e.g. <c>Disallow</c>.</param>
 /// <returns>
 /// The trimmed text after the colon (possibly empty), or <c>null</c> when the
 /// line does not start with the given directive.
 /// </returns>
 private static string ReadRobotsDirectiveValue(string line, string directive)
 {
     string trimmed = line.Trim();
     string prefix  = directive + ":";

     // Directive names are matched case-insensitively and only at the start of
     // the line, so comments or unrelated text containing the word are ignored.
     if (!trimmed.StartsWith(prefix, StringComparison.OrdinalIgnoreCase))
     {
         return null;
     }

     return trimmed.Substring(prefix.Length).Trim();
 }
Ejemplo n.º 2
0
        /// <summary>
        /// Loads a sitemap index document and adds to the link queue every child
        /// sitemap whose <c>lastmod</c> date is later than 1 December 2017.
        /// </summary>
        /// <param name="link">URI of the sitemap index XML document to load.</param>
        private void CrawlSiteMapIndex(string link)
        {
            string   sitemapNamespace = "http://www.sitemaps.org/schemas/sitemap/0.9";
            XElement sitemap          = XElement.Load(link);
            XName    sitemaps         = XName.Get("sitemap", sitemapNamespace);
            XName    loc              = XName.Get("loc", sitemapNamespace);
            XName    time             = XName.Get("lastmod", sitemapNamespace);
            DateTime givendate        = new DateTime(2017, 12, 1);

            foreach (XElement sitemapElement in sitemap.Elements(sitemaps))
            {
                XElement locElement  = sitemapElement.Element(loc);
                XElement timeElement = sitemapElement.Element(time);

                // Entries missing <loc> or <lastmod> cannot be checked against the
                // cutoff date; skip them instead of failing on a null element.
                if (locElement == null || timeElement == null)
                {
                    continue;
                }

                // Parse with the invariant culture so the result does not depend on the
                // machine's regional settings; a malformed date skips just this entry.
                DateTime publishDate;
                bool parsed = DateTime.TryParse(
                    timeElement.Value,
                    System.Globalization.CultureInfo.InvariantCulture,
                    System.Globalization.DateTimeStyles.None,
                    out publishDate);
                if (!parsed)
                {
                    continue;
                }

                if (publishDate > givendate)
                {
                    // Trim because XML text content may carry surrounding whitespace.
                    string locLink = locElement.Value.Trim();
                    StorageManager.LinkQueue().AddMessage(new CloudQueueMessage(locLink));
                }
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Takes one message from the link queue and dispatches it: a robots.txt URL
        /// goes to <c>HandleRobotstxt</c>, a link containing <c>-index</c> goes to
        /// <c>CrawlSiteMapIndex</c>, and anything else goes to <c>CrawlSiteMap</c>.
        /// The message is deleted after it has been handled. Does nothing further
        /// when the queue returns no message.
        /// </summary>
        public void CrawlUrl()
        {
            // Start the performance-counter task, then pause briefly before polling.
            // NOTE(review): the reason for the 100 ms pause is not visible here - confirm.
            new Task(GetPerfCounters).Start();
            Thread.Sleep(100);

            CloudQueueMessage linkMessage = StorageManager.LinkQueue().GetMessage();
            if (linkMessage == null)
            {
                return;
            }

            string stringifiedLink = linkMessage.AsString;

            // robots.txt message
            if (stringifiedLink.EndsWith("robots.txt", StringComparison.Ordinal))
            {
                HandleRobotstxt(stringifiedLink);
                StorageManager.LinkQueue().DeleteMessage(linkMessage);
                return;
            }

            // Sitemap message: an index lists further XML sitemaps, otherwise it lists HTML links.
            if (stringifiedLink.Contains("-index"))
            {
                CrawlSiteMapIndex(stringifiedLink);
            }
            else
            {
                CrawlSiteMap(stringifiedLink);
            }
            StorageManager.LinkQueue().DeleteMessage(linkMessage);

            // Peek rather than get: the emptiness check must not dequeue (and
            // temporarily hide) the next message that is still waiting to be processed.
            if (StorageManager.LinkQueue().PeekMessage() == null)
            {
                state = "Crawling";
            }
        }