// Refresh storage and helpers.
private void InitializeCrawlrComponents()
{
    this._storageManager = new CrawlrStorageManager(
        ConfigurationManager.AppSettings["StorageConnectionString"]);
    this._crawlrData = new CrawlrDataHelper();
    this._statusManager = new CrawlrStatusManager();
}
public void UpdateCrawlrStatus(
    string currentStatus, CrawlrDataHelper data, CrawlrStorageManager storage)
{
    WorkerRoleStatus currStatusEntity;
    if (currentStatus == "CLEAR")
    {
        currStatusEntity = new WorkerRoleStatus(
            currentStatus,
            (int)_cpuTime.NextValue(),
            (int)_memoryFree.NextValue(),
            0,
            new System.Collections.Generic.Queue<string>());
    }
    else
    {
        currStatusEntity = new WorkerRoleStatus(
            currentStatus,
            (int)_cpuTime.NextValue(),
            (int)_memoryFree.NextValue(),
            data.NumUrlsCrawled,
            data.LastTenUrls);
    }

    TableOperation insertStatus = TableOperation.InsertOrReplace(currStatusEntity);
    storage.StatusTable.Execute(insertStatus);
}
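// A minimal usage sketch, not from the original source: the worker's main loop
// might report status on each pass. The method name, the "CRAWLING" status
// string, and the sleep interval are assumptions; _crawlrData and
// _storageManager come from InitializeCrawlrComponents above.
private void ReportStatus()
{
    while (true)
    {
        this.UpdateCrawlrStatus("CRAWLING", this._crawlrData, this._storageManager);
        System.Threading.Thread.Sleep(TimeSpan.FromSeconds(10));
    }
}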
/// <summary>
/// Checks whether a given URL has already been queued/parsed; if not, adds it to the queue.
/// </summary>
/// <param name="currHref">
/// URL to check.
/// </param>
/// <param name="currUri">
/// Domain space of the URL.
/// </param>
/// <param name="urlLastMod">
/// The URL's lastmod/published date, if known. Nullable.
/// </param>
/// <param name="data">
/// Crawler data helper. Ref.
/// </param>
/// <param name="storage">
/// Crawler Azure storage helper. Ref.
/// </param>
public static void ChkAndAddUrl(
    string currHref, string currUri, DateTime? urlLastMod,
    ref CrawlrDataHelper data, ref CrawlrStorageManager storage)
{
    bool validDateIfExists = true;
    string domain = new Uri(currUri).Host;

    // Normalize protocol-relative and root-relative hrefs to absolute URLs.
    if (currHref.StartsWith(@"//"))
    {
        currHref = @"http:" + currHref;
    }
    else if (currHref.StartsWith(@"/"))
    {
        currHref = @"http://" + domain + currHref;
    }

    // Only accept pages dated within the last two months, when a date is known.
    if (urlLastMod != null)
    {
        validDateIfExists = (urlLastMod >= DateTime.Now - TimeSpan.FromDays(62));
    }

    if (IsInProperDomain(currHref)
        && !data.QueuedUrls.Contains(currHref)
        && !data.AddedUrls.Contains(currHref)
        && validDateIfExists)
    {
        CloudQueueMessage urlMsg = new CloudQueueMessage(currHref);
        storage.UrlQueue.AddMessage(urlMsg);
        data.QueuedUrls.Add(currHref);
        data.NumUrlsQueued++;
    }
}
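// IsInProperDomain is called above but not shown in this section. A minimal
// sketch, assuming the crawler is scoped to cnn.com and bleacherreport.com
// (the two hosts named in CrawlRobotsTxt below); the exact scoping rule is
// an assumption.
private static bool IsInProperDomain(string url)
{
    Uri uri;
    if (!Uri.TryCreate(url, UriKind.Absolute, out uri))
    {
        return false;
    }
    string host = uri.Host.ToLower();
    return host.EndsWith("cnn.com") || host.EndsWith("bleacherreport.com");
}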
/// <summary>
/// Crawls a given robots.txt, adding all sitemaps to the queue.
/// </summary>
/// <param name="data">
/// Crawler data helper. Ref.
/// </param>
/// <param name="storage">
/// Crawler Azure storage helper. Ref.
/// </param>
public static void CrawlRobotsTxt(ref CrawlrDataHelper data, ref CrawlrStorageManager storage)
{
    string url = storage.GetCurrentRobotsTxt();
    CrawlSpecificRobotsTxt(url, ref data, ref storage);

    // Include bleacherreport.com (formerly cnn.com/sports) when crawling cnn.com.
    if (storage.GetCurrentRobotsTxt().Contains("cnn"))
    {
        CrawlSpecificRobotsTxt("http://www.bleacherreport.com/robots.txt", ref data, ref storage);
    }
}
private static void CrawlSpecificRobotsTxt(
    string url, ref CrawlrDataHelper data, ref CrawlrStorageManager storage)
{
    string tempPath = Path.GetTempFileName();
    using (WebClient wc = new WebClient())
    {
        wc.DownloadFile(url, tempPath);
    }

    using (StreamReader input = new StreamReader(tempPath))
    {
        string currLine;
        string currUserAgent = "";
        while ((currLine = input.ReadLine()) != null)
        {
            var splitLine = currLine.Split(' ');
            if (splitLine.Length < 2)
            {
                // Skip blank lines and directives without a value.
                continue;
            }

            if (splitLine[0].ToLower() == "sitemap:")
            {
                // Only the NBA sitemap is wanted from bleacherreport.com;
                // every sitemap is taken from any other host.
                bool pass = !url.Contains("bleacherreport") || splitLine[1].Contains("nba");
                if (pass)
                {
                    data.QueuedXmls.Add(splitLine[1]);
                    CloudQueueMessage msg = new CloudQueueMessage(splitLine[1]);
                    storage.XmlQueue.AddMessage(msg);
                    data.NumXmlsQueued++;
                }
            }
            else if (splitLine[0].ToLower() == "user-agent:")
            {
                currUserAgent = splitLine[1];
            }
            else if (splitLine[0].ToLower() == "disallow:" && currUserAgent == "*")
            {
                data.DisallowedStrings.Add(splitLine[1]);
            }
        }
    }
}
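// For reference, CrawlSpecificRobotsTxt expects standard robots.txt directives,
// each a token and value separated by a single space (URL is illustrative):
//
//   User-agent: *
//   Disallow: /private/
//   Sitemap: http://www.example.com/sitemap.xml
//
// Lines with no value after the directive are skipped by the Length check above.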
/// <summary>
/// Crawls a given XML document (sitemap) by URI, checking each found URI's date where
/// possible and filtering out links that are disallowed or older than two months. Places
/// found and valid XMLs/URLs into the relevant Azure queue for processing.
/// </summary>
/// <param name="data">
/// Crawler data helper. Ref.
/// </param>
/// <param name="storage">
/// Crawler Azure storage helper. Ref.
/// </param>
/// <param name="xml">
/// URI of the XML document (sitemap) to crawl.
/// </param>
public static void CrawlXml(
    ref CrawlrDataHelper data, ref CrawlrStorageManager storage, string xml)
{
    if (!data.ChkIfUriAllowed(xml))
    {
        return;
    }

    try
    {
        if (xml.EndsWith(".xml"))
        {
            XElement sitemap = XElement.Load(xml);
            string sitemapType = sitemap.Name.LocalName;
            string nameSpace = sitemap.GetDefaultNamespace().ToString();

            // News and video sitemaps carry their dates in a prefixed
            // namespace rather than in <lastmod>.
            string dateNameSpace = null;
            XName dateParent = null;
            XName date = null;
            if (sitemap.ToString().Contains(@"xmlns:news"))
            {
                dateNameSpace = sitemap.GetNamespaceOfPrefix("news").ToString();
                dateParent = XName.Get("news", dateNameSpace);
                date = XName.Get("publication_date", dateNameSpace);
            }
            else if (sitemap.ToString().Contains(@"xmlns:video"))
            {
                dateNameSpace = sitemap.GetNamespaceOfPrefix("video").ToString();
                dateParent = XName.Get("video", dateNameSpace);
                date = XName.Get("publication_date", dateNameSpace);
            }

            XName loc = XName.Get("loc", nameSpace);
            XName lastMod = XName.Get("lastmod", nameSpace);

            // A sitemap index nests <sitemap> elements; a regular sitemap nests <url>.
            XName elementSelector = sitemapType == "sitemapindex"
                ? XName.Get("sitemap", nameSpace)
                : XName.Get("url", nameSpace);

            List<string> xmlsToQueue = new List<string>();
            List<string> urlsToQueue = new List<string>();
            foreach (var element in sitemap.Elements(elementSelector))
            {
                bool validDateIfExists = true;
                var currLocElement = element.Element(loc);
                string currLocValue = currLocElement.Value;

                var currLastModElement = element.Element(lastMod);
                if (currLastModElement == null && dateParent != null)
                {
                    currLastModElement = element.Element(dateParent);
                    currLastModElement = (currLastModElement == null
                        ? null
                        : currLastModElement.Element(date));
                }

                if (currLastModElement != null)
                {
                    validDateIfExists = DateTime.Parse(currLastModElement.Value)
                        >= DateTime.Now - TimeSpan.FromDays(62);
                }

                if (currLocValue.Contains(".xml"))
                {
                    if (!data.QueuedXmls.Contains(currLocValue) && validDateIfExists)
                    {
                        xmlsToQueue.Add(currLocValue);
                    }
                }
                else if (!data.QueuedUrls.Contains(currLocValue) && validDateIfExists)
                {
                    urlsToQueue.Add(currLocValue);
                }
            }

            foreach (string newXml in xmlsToQueue)
            {
                CloudQueueMessage msg = new CloudQueueMessage(newXml);
                storage.XmlQueue.AddMessage(msg);
                data.QueuedXmls.Add(newXml);
                data.NumXmlsQueued++;
            }

            foreach (string newUrl in urlsToQueue)
            {
                UrlCrawlr.ChkAndAddUrl(newUrl, xml, null, ref data, ref storage);
            }
        }
    }
    catch (Exception ex)
    {
        ErrorUrl errorUrl = new ErrorUrl(xml, ex.ToString());
        TableOperation insertErrorUrl = TableOperation.InsertOrReplace(errorUrl);
        storage.ErrorTable.Execute(insertErrorUrl);
    }
}
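// For reference, a news-sitemap <url> entry of the shape CrawlXml handles above
// (namespaces per the sitemaps.org and Google News sitemap schemas; the page
// URL itself is illustrative):
//
//   <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
//           xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">
//     <url>
//       <loc>http://www.example.com/some-story.html</loc>
//       <news:news>
//         <news:publication_date>2018-03-01</news:publication_date>
//       </news:news>
//     </url>
//   </urlset>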
/// <summary>
/// Crawls a given URL, queueing all found URLs and indexing information about
/// the given URL for later querying.
/// </summary>
/// <param name="data">
/// Crawler data helper. Ref.
/// </param>
/// <param name="storage">
/// Crawler Azure storage helper. Ref.
/// </param>
/// <param name="url">
/// The given URL to crawl.
/// </param>
public static void CrawlUrl(
    ref CrawlrDataHelper data, ref CrawlrStorageManager storage, string url)
{
    if (!data.ChkIfUriAllowed(url))
    {
        return;
    }

    try
    {
        var web = new HtmlWeb();
        var currDoc = web.Load(url);

        var urlNodes = currDoc.DocumentNode.Descendants("a").ToList();
        var urlPageTitle = currDoc.DocumentNode.Descendants("title")
            .First()
            .InnerText;

        // Prefer the page's "pubdate" meta tag as its last-modified date, if present.
        var urlLastModNode = currDoc.DocumentNode.Descendants("meta")
            .Where(y => y.Attributes.Contains("name"))
            .Where(y => y.Attributes["name"].Value == "pubdate")
            .ToList();
        DateTime? urlLastMod = null;
        if (urlLastModNode.Count > 0)
        {
            urlLastMod = DateTime.Parse(
                urlLastModNode.First().Attributes["content"].Value);
        }

        // Queue every href found on the page.
        List<string> urlsToQueue = new List<string>();
        foreach (var urlNode in urlNodes)
        {
            if (urlNode.Attributes.Contains("href"))
            {
                urlsToQueue.Add(urlNode.Attributes["href"].Value);
            }
        }

        foreach (string newUrl in urlsToQueue)
        {
            ChkAndAddUrl(newUrl, url, urlLastMod, ref data, ref storage);
        }

        if (!data.AddedUrls.Contains(url))
        {
            data.AddedUrls.Add(url);
        }

        data.NumUrlsCrawled++;

        // Index the page under each word of its title for later querying.
        string[] splitPageTitle = urlPageTitle.Split(' ');
        foreach (string s in splitPageTitle)
        {
            string plainText = Regex.Replace(s.ToLower(), "[^a-zA-Z0-9]", "");
            if (plainText != "")
            {
                IndexedUrl wordToUrl = new IndexedUrl(
                    plainText,
                    urlPageTitle,
                    (urlLastMod != null ? urlLastMod.ToString() : "NULL"),
                    url);
                TableOperation insertWordToUrl = TableOperation.InsertOrReplace(wordToUrl);
                storage.UrlTable.Execute(insertWordToUrl);
                data.NumUrlsIndexed++;
            }
        }

        UrlTableCount newCount = new UrlTableCount(data.NumUrlsCrawled, data.NumUrlsIndexed);
        TableOperation insertCount = TableOperation.InsertOrReplace(newCount);
        storage.UrlTable.Execute(insertCount);

        // Track the ten most recently crawled URLs for status reporting.
        if (data.LastTenUrls.Count == 10)
        {
            data.LastTenUrls.Dequeue();
        }
        data.LastTenUrls.Enqueue(url);
    }
    catch (Exception ex)
    {
        ErrorUrl errorUrl = new ErrorUrl(url, ex.ToString());
        TableOperation insertErrorUrl = TableOperation.InsertOrReplace(errorUrl);
        storage.ErrorTable.Execute(insertErrorUrl);
    }
}
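// A hedged sketch, not from the original source: a worker role's Run() loop
// might drain the two queues and dispatch to the crawl methods above. The loop
// shape, queue priority, and sleep interval are assumptions; GetMessage,
// DeleteMessage, and AsString are the classic Azure Storage queue API the rest
// of the code already uses, and _crawlrData/_storageManager come from
// InitializeCrawlrComponents.
public override void Run()
{
    while (true)
    {
        // Drain sitemaps first so the URL queue keeps filling.
        CloudQueueMessage xmlMsg = _storageManager.XmlQueue.GetMessage();
        if (xmlMsg != null)
        {
            UrlCrawlr.CrawlXml(ref _crawlrData, ref _storageManager, xmlMsg.AsString);
            _storageManager.XmlQueue.DeleteMessage(xmlMsg);
            continue;
        }

        CloudQueueMessage urlMsg = _storageManager.UrlQueue.GetMessage();
        if (urlMsg != null)
        {
            UrlCrawlr.CrawlUrl(ref _crawlrData, ref _storageManager, urlMsg.AsString);
            _storageManager.UrlQueue.DeleteMessage(urlMsg);
        }
        else
        {
            System.Threading.Thread.Sleep(1000);
        }
    }
}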