Пример #1
0
 /// <summary>
 /// Creates a crawler: keeps the two rule lists supplied by the caller, starts
 /// with an empty visited-link set, and obtains its loading queue, crawling
 /// queue and table from <c>CloudConfiguration</c>.
 /// </summary>
 /// <param name="CNNRules">Rule strings stored in the <c>CNNRules</c> member.</param>
 /// <param name="BleacherReportRules">Rule strings stored in the <c>BleacherReportRules</c> member.</param>
 public WebCrawler(List<string> CNNRules, List<string> BleacherReportRules)
 {
     // Caller-supplied state first; "this." is required because the
     // parameters share their names with the members.
     this.CNNRules = CNNRules;
     this.BleacherReportRules = BleacherReportRules;
     VisitedLinks = new HashSet<string>();

     // Storage handles, requested in the same order as before.
     LoadQueue = CloudConfiguration.GetLoadingQueue();
     CrawlQueue = CloudConfiguration.GetCrawlingQueue();
     Table = CloudConfiguration.GetTable();

     // Fixed cut-off date: 1 December 2016.
     OldestAllowed = new DateTime(2016, 12, 1);

     // Only ".jpg" is listed as a bad extension.
     List<string> extensions = new List<string>();
     extensions.Add(".jpg");
     BadExtensions = extensions;
 }
Пример #2
0
        /// <summary>
        /// Worker entry point. Builds a <c>WebCrawler</c> from the rule lists
        /// returned by <c>ProcessRobots</c> for the CNN and Bleacher Report
        /// robots.txt URLs, then polls the queues until a message appears on the
        /// stop queue:
        /// <list type="bullet">
        /// <item>a load-queue message is processed first and moves <c>State</c> to "Loading";</item>
        /// <item>otherwise, once <c>State</c> is "Loading" or "Crawling", one crawl-queue
        /// message is processed and <c>State</c> becomes "Crawling".</item>
        /// </list>
        /// After a stop message has been seen the method never resumes work: it
        /// keeps <c>State</c> at "Idle" forever. The method does not return.
        /// </summary>
        public override void Run()
        {
            // Back-off applied whenever a polling pass finds nothing to do, so the
            // worker neither hammers the storage queues nor spins a CPU core.
            const int IdleDelayMilliseconds = 1000;

            CloudQueue    LoadQueue           = CloudConfiguration.GetLoadingQueue();
            CloudQueue    CrawlQueue          = CloudConfiguration.GetCrawlingQueue();
            CloudQueue    StopQueue           = CloudConfiguration.GetStopQueue();

            // NOTE(review): the returned table was never used in this method. The call
            // is kept in case GetTable() has set-up side effects — confirm and remove.
            CloudConfiguration.GetTable();

            List <string> CNNRules            = ProcessRobots("http://www.cnn.com/robots.txt");
            List <string> BleacherReportRules = ProcessRobots("http://www.bleacherreport.com/robots.txt");
            WebCrawler    Crawler             = new WebCrawler(CNNRules, BleacherReportRules);

            State = "Idle";
            Thread.Sleep(10000);

            CloudQueueMessage stopMessage = StopQueue.GetMessage();

            CPUCount = new PerformanceCounter("Processor", "% Processor Time", "_Total");
            MemCount = new PerformanceCounter("Memory", "Available MBytes");

            while (true)
            {
                while (stopMessage == null)
                {
                    bool processedMessage = false;

                    // Load messages take priority over crawl messages.
                    CloudQueueMessage loadMessage = LoadQueue.GetMessage();

                    if (loadMessage != null)
                    {
                        State = "Loading";
                        string message = loadMessage.AsString;
                        Crawler.ProcessURL(message);
                        LoadQueue.DeleteMessage(loadMessage);
                        processedMessage = true;
                    }
                    else if (State.Equals("Loading") || State.Equals("Crawling"))
                    {
                        // Crawl messages are consumed only after at least one
                        // load message has been handled.
                        CloudQueueMessage crawlMessage = CrawlQueue.GetMessage();
                        if (crawlMessage != null)
                        {
                            State = "Crawling";
                            Crawler.ProcessURL(crawlMessage.AsString);
                            CrawlQueue.DeleteMessage(crawlMessage);
                            processedMessage = true;
                        }
                    }

                    stopMessage = StopQueue.GetMessage();

                    // Nothing was dequeued on this pass: wait before polling again
                    // instead of issuing back-to-back queue requests.
                    if (!processedMessage && stopMessage == null)
                    {
                        Thread.Sleep(IdleDelayMilliseconds);
                    }
                }

                // Stopped: stopMessage is never cleared, so this branch repeats
                // forever. Sleep between iterations rather than busy-spinning.
                State = "Idle";
                Thread.Sleep(IdleDelayMilliseconds);
            }
        }
Пример #3
0
        /// <summary>
        /// Downloads the page at <paramref name="URL"/>, extracts its title with
        /// <c>GetTitle</c>, and inserts a <c>URLEntity</c> (URL, title, current
        /// local time) into the table obtained from <c>CloudConfiguration</c>.
        /// </summary>
        /// <param name="URL">Address of the page to fetch.</param>
        /// <remarks>
        /// Exceptions from the web request or the table insert are not caught here
        /// and propagate to the caller.
        /// </remarks>
        private void ProcessHTML(string URL)
        {
            Table = CloudConfiguration.GetTable();

            string content;
            WebRequest request = WebRequest.Create(URL);

            // The using blocks release the response, stream and reader even when the
            // download fails part-way; the connection is also freed before the
            // table insert below instead of being held open across it.
            using (WebResponse response = request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream))
            {
                content = reader.ReadToEnd();
            }

            string title = GetTitle(content);

            // NOTE(review): DateTime.Now is local time; confirm whether readers of
            // the table expect UTC before changing it.
            URLEntity link = new URLEntity(URL, title, DateTime.Now);

            // Insert the page record into table storage.
            TableOperation insertOperation = TableOperation.Insert(link);
            Table.Execute(insertOperation);

            // NOTE(review): this method does not extract or enqueue outgoing links.
            // An earlier regex-based attempt existed only as commented-out code and
            // has been removed.
        }
Пример #4
0
        /// <summary>
        /// Looks up the stored title for <paramref name="URL"/> in table storage.
        /// The URL is Base64-encoded and matched against the entities'
        /// <c>PartitionKey</c>.
        /// </summary>
        /// <param name="URL">The page URL whose title is requested.</param>
        /// <returns>
        /// The <c>Title</c> of the first matching entity, or a "not found" message
        /// containing the URL when no entity matches.
        /// </returns>
        public string GetPageTitle(string URL)
        {
            Table = CloudConfiguration.GetTable();
            Table.CreateIfNotExists();

            string encodedURL = Base64.Base64Encode(URL);

            TableQuery <URLEntity> rangeQuery = new TableQuery <URLEntity>()
                                                .Where(TableQuery.GenerateFilterCondition("PartitionKey", QueryComparisons.Equal, encodedURL));

            var data = Table.ExecuteQuery(rangeQuery);

            // An empty result set is a non-null sequence, so checking the sequence
            // for null never detected "no match" and First() threw instead.
            // FirstOrDefault() yields null for an empty result.
            URLEntity match = data?.FirstOrDefault();

            if (match == null)
            {
                return($"URl {URL} not found in table storage");
            }

            return(match.Title);
        }