/// <summary>
/// Builds a crawler: obtains the loading queue, crawling queue and table from
/// <c>CloudConfiguration</c>, stores the two per-site rule lists, and
/// initialises the crawl bookkeeping (visited set, date cutoff, extension list).
/// </summary>
/// <param name="CNNRules">Rule list kept for CNN (stored as-is, not copied).</param>
/// <param name="BleacherReportRules">Rule list kept for Bleacher Report (stored as-is, not copied).</param>
public WebCrawler(List<string> CNNRules, List<string> BleacherReportRules)
{
    // Storage handles first, in the same call order as before.
    LoadQueue = CloudConfiguration.GetLoadingQueue();
    CrawlQueue = CloudConfiguration.GetCrawlingQueue();
    Table = CloudConfiguration.GetTable();

    // Caller-supplied rule lists; "this." is required because the
    // parameters share their names with the members.
    this.CNNRules = CNNRules;
    this.BleacherReportRules = BleacherReportRules;

    // Fresh per-instance state.
    VisitedLinks = new HashSet<string>();

    // NOTE(review): presumably pages older than this date are skipped — confirm at the use site.
    OldestAllowed = new DateTime(2016, 12, 1);

    List<string> rejectedExtensions = new List<string>();
    rejectedExtensions.Add(".jpg");
    BadExtensions = rejectedExtensions;
}
/// <summary>
/// Worker loop. Builds the per-site rule lists via <c>ProcessRobots</c>,
/// creates a <c>WebCrawler</c>, then repeatedly drains the loading queue and
/// (once loading has started) the crawling queue until a message appears on
/// the stop queue.
/// </summary>
/// <remarks>
/// State transitions: "Idle" at start; "Loading" whenever a load message is
/// processed; "Crawling" whenever a crawl message is processed; back to
/// "Idle" once a stop message has been observed. The stop message is never
/// deleted or re-polled here, so after a stop this method stays idle and
/// never returns.
/// </remarks>
public override void Run()
{
    // Pauses used when there is nothing to do, so the worker does not
    // spin at full speed against the storage queues or the CPU.
    const int emptyPollDelayMs = 500;
    const int stoppedDelayMs = 1000;

    CloudQueue loadQueue = CloudConfiguration.GetLoadingQueue();
    CloudQueue crawlQueue = CloudConfiguration.GetCrawlingQueue();
    CloudQueue stopQueue = CloudConfiguration.GetStopQueue();

    List<string> cnnRules = ProcessRobots("http://www.cnn.com/robots.txt");
    List<string> bleacherReportRules = ProcessRobots("http://www.bleacherreport.com/robots.txt");
    WebCrawler crawler = new WebCrawler(cnnRules, bleacherReportRules);

    State = "Idle";
    Thread.Sleep(10000);
    CloudQueueMessage stopMessage = stopQueue.GetMessage();

    CPUCount = new PerformanceCounter("Processor", "% Processor Time", "_Total");
    MemCount = new PerformanceCounter("Memory", "Available MBytes");

    while (true)
    {
        while (stopMessage == null)
        {
            bool processedMessage = false;

            // Load messages take priority over crawl messages.
            CloudQueueMessage loadMessage = loadQueue.GetMessage();
            if (loadMessage != null)
            {
                State = "Loading";
                crawler.ProcessURL(loadMessage.AsString);
                loadQueue.DeleteMessage(loadMessage);
                processedMessage = true;
            }
            else if (State.Equals("Loading") || State.Equals("Crawling"))
            {
                // The crawl queue is only consulted once loading has begun.
                CloudQueueMessage crawlMessage = crawlQueue.GetMessage();
                if (crawlMessage != null)
                {
                    State = "Crawling";
                    crawler.ProcessURL(crawlMessage.AsString);
                    crawlQueue.DeleteMessage(crawlMessage);
                    processedMessage = true;
                }
            }

            stopMessage = stopQueue.GetMessage();

            // Nothing was available this round: back off briefly instead of
            // immediately polling the queues again.
            if (!processedMessage && stopMessage == null)
            {
                Thread.Sleep(emptyPollDelayMs);
            }
        }

        State = "Idle";

        // Stopped: keep the role alive without busy-spinning.
        Thread.Sleep(stoppedDelayMs);
    }
}
/// <summary>
/// Downloads the page at <paramref name="URL"/>, extracts its title with
/// <c>GetTitle</c>, and inserts a <c>URLEntity</c> (URL, title, current local
/// time) into the table.
/// </summary>
/// <param name="URL">Address of the page to fetch and index.</param>
/// <remarks>
/// Nothing is caught here: failures from the web request or from the table
/// insert propagate to the caller. Links found in the page are not extracted
/// or enqueued by this method.
/// </remarks>
private void ProcessHTML(string URL)
{
    Table = CloudConfiguration.GetTable();

    // Read the whole body, then release the response, stream and reader
    // before touching table storage. The using blocks guarantee disposal
    // even when the download or read throws.
    string content;
    WebRequest request = WebRequest.Create(URL);
    using (WebResponse response = request.GetResponse())
    using (Stream responseStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(responseStream))
    {
        content = reader.ReadToEnd();
    }

    string title = GetTitle(content);
    URLEntity link = new URLEntity(URL, title, DateTime.Now);

    // Plain insert, as before.
    TableOperation insertOperation = TableOperation.Insert(link);
    Table.Execute(insertOperation);
}
/// <summary>
/// Looks up the stored title for <paramref name="URL"/> in table storage.
/// The lookup filters on <c>PartitionKey</c> equal to the Base64-encoded URL
/// and uses the first matching entity.
/// </summary>
/// <param name="URL">The page URL whose title is requested.</param>
/// <returns>
/// The <c>Title</c> of the first matching entity, or a "not found" message
/// when the query yields no entity.
/// </returns>
public string GetPageTitle(string URL)
{
    Table = CloudConfiguration.GetTable();
    Table.CreateIfNotExists();

    string encodedURL = Base64.Base64Encode(URL);
    TableQuery<URLEntity> partitionQuery = new TableQuery<URLEntity>()
        .Where(TableQuery.GenerateFilterCondition("PartitionKey", QueryComparisons.Equal, encodedURL));

    // An empty result is the normal "no such URL" case. FirstOrDefault
    // turns it into null instead of the exception First() would throw,
    // which makes the not-found branch below reachable.
    var data = Table.ExecuteQuery(partitionQuery);
    URLEntity match = data?.FirstOrDefault();

    if (match == null)
    {
        return $"URl {URL} not found in table storage";
    }

    return match.Title;
}