Пример #1
0
        /// <summary>
        /// Downloads the XML document at <paramref name="URL"/> and enqueues the links it lists.
        /// </summary>
        /// <remarks>
        /// Reads every <c>sitemap</c> element or, when there are none, every <c>url</c> element.
        /// The first child of each element is taken as the link; when the element has further
        /// children, the second child is taken as the date and passed to
        /// <c>CheckLinkIsRecent</c>. Links that pass all checks go to <c>LoadQueue</c> when they
        /// end in "xml", and to <c>CrawlQueue</c> otherwise.
        /// </remarks>
        /// <param name="URL">Address of the XML document to download.</param>
        private void ProcessXML(string URL)
        {
            HttpWebRequest httpRequest = (HttpWebRequest)WebRequest.Create(URL);

            httpRequest.Timeout   = 10000;   // milliseconds (10 secs)
            httpRequest.UserAgent = "Code Sample Web Client";

            XmlDocument xmlDoc = new XmlDocument();

            // Dispose the response and the reader as soon as the document is loaded so the
            // underlying connection is released even when loading throws.
            using (HttpWebResponse webResponse = (HttpWebResponse)httpRequest.GetResponse())
            using (StreamReader reader = new StreamReader(webResponse.GetResponseStream()))
            {
                xmlDoc.Load(reader);
            }

            // Prefer <sitemap> entries; fall back to <url> entries when there are none.
            XmlNodeList elements = xmlDoc.GetElementsByTagName("sitemap");

            if (elements.Count == 0)
            {
                elements = xmlDoc.GetElementsByTagName("url");
            }

            foreach (XmlNode element in elements)
            {
                // An empty element has no link to read; skip it instead of dereferencing null.
                if (!element.HasChildNodes)
                {
                    continue;
                }

                string link        = element.ChildNodes[0].InnerText;
                bool   correctDate = true;

                // When the last child differs from the link there is more than one child,
                // and the second one is read as the date.
                if (element.LastChild.InnerText != link)
                {
                    string date = element.ChildNodes[1].InnerText;
                    correctDate = CheckLinkIsRecent(link, date);
                }

                if (CheckLinkDomain(link) && CheckLinkIsCorrectType(link) && CheckIfAllowed(link) && correctDate)
                {
                    CloudQueueMessage linkMessage = new CloudQueueMessage(link);

                    // Ordinal comparison: the suffix is a fixed ASCII token, not linguistic text.
                    if (link.EndsWith("xml", System.StringComparison.Ordinal))
                    {
                        LoadQueue.AddMessage(linkMessage);
                    }
                    else
                    {
                        CrawlQueue.AddMessage(linkMessage);
                    }
                }
            }
        }
Пример #2
0
 /// <summary>
 /// Starts the crawler: when the stop queue is empty, seeds the loading queue with the
 /// robots.txt addresses; otherwise removes one pending message from the stop queue.
 /// </summary>
 /// <returns>The fixed status text "start crawler method executed".</returns>
 public string StartCrawler()
 {
     StopQueue  = CloudConfiguration.GetStopQueue();
     StateQueue = CloudConfiguration.GetStateQueue();
     if (StopQueue.PeekMessage() == null)
     {
         LoadQueue = CloudConfiguration.GetLoadingQueue();

         // Both robots.txt addresses travel in one message, separated by a space.
         CloudQueueMessage startMessage =
             new CloudQueueMessage("http://www.cnn.com/robots.txt http://www.bleacherreport.com/robots.txt");
         LoadQueue.AddMessage(startMessage);

         // NOTE(review): a "Loading" message used to be constructed here and then discarded;
         // nothing is written to StateQueue in this method. Confirm whether a state message
         // should be enqueued.
     }
     else
     {
         // Peek followed by Get is not atomic: the message seen by PeekMessage may already be
         // gone (presumably GetMessage returns null then - verify against the queue API),
         // so only delete when a message was actually retrieved.
         CloudQueueMessage stopMessage = StopQueue.GetMessage();
         if (stopMessage != null)
         {
             StopQueue.DeleteMessage(stopMessage);
         }
     }
     return("start crawler method executed");
 }
Пример #3
0
        /// <summary>
        /// Downloads the text file at <paramref name="URL"/> and enqueues every distinct link
        /// found on a line starting with "sitemap:" onto <c>LoadQueue</c>, provided the link
        /// passes <c>CheckLinkDomain</c>.
        /// </summary>
        /// <param name="URL">Address of the text file to download.</param>
        private void ProcessTxt(string URL)
        {
            string file;

            // Dispose the client once the download is done.
            using (WebClient client = new WebClient())
            {
                file = client.DownloadString(URL);
            }

            HashSet<string> links = new HashSet<string>();

            foreach (string rawLine in file.Split('\n'))
            {
                // Trim removes the '\r' left behind by CRLF line endings as well as
                // surrounding spaces, so it cannot end up inside the queued link.
                string line = rawLine.Trim();

                // Case-insensitive ordinal match; lower-casing under the current culture
                // can change letters (e.g. Turkish 'I') and miss the directive.
                if (!line.StartsWith("sitemap:", System.StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                // A directive without any "http" has no link to extract; skip it rather
                // than let Substring throw on a negative index.
                int linkStart = line.IndexOf("http", System.StringComparison.OrdinalIgnoreCase);
                if (linkStart < 0)
                {
                    continue;
                }

                links.Add(line.Substring(linkStart));
            }

            foreach (string link in links)
            {
                if (CheckLinkDomain(link))
                {
                    CloudQueueMessage message = new CloudQueueMessage(link);
                    LoadQueue.AddMessage(message);
                }
            }
        }
Пример #4
0
        /// <summary>
        /// Empties the stop queue, then places one message per robots.txt address on the
        /// loading queue.
        /// </summary>
        /// <returns>
        /// The loading queue's name followed by the text of the two enqueued messages,
        /// separated by single spaces.
        /// </returns>
        public string StartCrawler()
        {
            StopQueue = CloudConfiguration.GetStopQueue();

            // Drain: keep fetching and deleting until the queue hands back nothing.
            for (CloudQueueMessage pending = StopQueue.GetMessage();
                 pending != null;
                 pending = StopQueue.GetMessage())
            {
                StopQueue.DeleteMessage(pending);
            }

            LoadQueue = CloudConfiguration.GetLoadingQueue();

            // Seed messages, enqueued in this order.
            CloudQueueMessage cnnSeed      = new CloudQueueMessage("http://www.cnn.com/robots.txt");
            CloudQueueMessage bleacherSeed = new CloudQueueMessage("http://www.bleacherreport.com/robots.txt");

            LoadQueue.AddMessage(cnnSeed);
            LoadQueue.AddMessage(bleacherSeed);

            return string.Join(" ", LoadQueue.Name, cnnSeed.AsString, bleacherSeed.AsString);
        }