/// <summary>
/// Crawls a given URL, queueing all found URLs and storing information about
/// the given URL for later querying.
/// </summary>
/// <remarks>
/// Does nothing when <c>data.ChkIfUriAllowed</c> rejects the URL. Any exception
/// raised while loading, parsing or storing the page is caught and written to
/// <c>storage.ErrorTable</c> as an <c>ErrorUrl</c>; nothing is rethrown.
/// A page without a <c>title</c> element is stored with an empty title, and a
/// missing or unparseable <c>pubdate</c> meta tag is stored as "NULL".
/// </remarks>
/// <param name="data">
/// Crawler data helper. Ref.
/// </param>
/// <param name="storage">
/// Crawler azure storage helper. Ref.
/// </param>
/// <param name="url">
/// The given URL to crawl.
/// </param>
public static void CrawlUrl(ref CrawlrDataHelper data, ref CrawlrStorageManager storage, string url)
{
    if (!data.ChkIfUriAllowed(url))
    {
        return;
    }

    try
    {
        var web = new HtmlWeb();
        var currDoc = web.Load(url);

        // A page without a <title> must not abort the crawl: its links are
        // still worth queueing, so fall back to an empty title instead of
        // letting First() throw.
        var titleNode = currDoc.DocumentNode.Descendants("title").FirstOrDefault();
        string urlPageTitle = titleNode != null ? titleNode.InnerText : string.Empty;

        // The pubdate meta tag is optional metadata. A missing "content"
        // attribute or a value that is not a date leaves urlLastMod null
        // rather than failing the whole page.
        DateTime? urlLastMod = null;
        var pubDateNode = currDoc.DocumentNode.Descendants("meta")
            .Where(y => y.Attributes.Contains("name"))
            .FirstOrDefault(y => y.Attributes["name"].Value == "pubdate");
        if (pubDateNode != null)
        {
            var contentAttribute = pubDateNode.Attributes["content"];
            DateTime parsedLastMod;
            if (contentAttribute != null
                && DateTime.TryParse(contentAttribute.Value, out parsedLastMod))
            {
                urlLastMod = parsedLastMod;
            }
        }

        // Collect every href first, then hand each one to ChkAndAddUrl.
        List<string> urlsToQueue = currDoc.DocumentNode.Descendants("a")
            .Where(a => a.Attributes.Contains("href"))
            .Select(a => a.Attributes["href"].Value)
            .ToList();
        foreach (string newUrl in urlsToQueue)
        {
            ChkAndAddUrl(newUrl, url, urlLastMod, ref data, ref storage);
        }

        // Only a URL seen for the first time counts towards the indexed total.
        if (!data.AddedUrls.Contains(url))
        {
            data.AddedUrls.Add(url);
            data.NumUrlsIndexed++;
        }
        data.NumUrlsCrawled++;

        FoundUrl finishedUrl = new FoundUrl(
            urlPageTitle,
            urlLastMod.HasValue ? urlLastMod.Value.ToString() : "NULL",
            url);
        UrlTableCount newCount = new UrlTableCount(data.NumUrlsCrawled, data.NumUrlsIndexed);
        TableOperation insertUrl = TableOperation.InsertOrReplace(finishedUrl);
        TableOperation insertCount = TableOperation.InsertOrReplace(newCount);
        storage.UrlTable.Execute(insertUrl);
        storage.UrlTable.Execute(insertCount);

        // Keep LastTenUrls capped at ten entries by dropping the oldest one.
        if (data.LastTenUrls.Count == 10)
        {
            data.LastTenUrls.Dequeue();
        }
        data.LastTenUrls.Enqueue(url);
    }
    catch (Exception ex)
    {
        // Best effort: record the failure against the URL instead of
        // propagating it to the caller.
        ErrorUrl errorUrl = new ErrorUrl(url, ex.ToString());
        TableOperation insertErrorUrl = TableOperation.InsertOrReplace(errorUrl);
        storage.ErrorTable.Execute(insertErrorUrl);
    }
}
/// <summary>
/// Crawls a given URL, queueing all found URLs and storing information about
/// the given URL for later querying.
/// </summary>
/// <remarks>
/// Does nothing when <c>data.ChkIfUriAllowed</c> rejects the URL. Each word of
/// the page title is lower-cased, stripped of non-alphanumeric characters and
/// stored as an <c>IndexedUrl</c>; <c>data.NumUrlsIndexed</c> is incremented
/// once per stored word. A page without a <c>title</c> element yields no index
/// entries, and a missing or unparseable <c>pubdate</c> meta tag is stored as
/// "NULL". Exceptions raised while loading or storing the page are not caught
/// here and propagate to the caller.
/// </remarks>
/// <param name="data">
/// Crawler data helper. Ref.
/// </param>
/// <param name="storage">
/// Crawler azure storage helper. Ref.
/// </param>
/// <param name="url">
/// The given URL to crawl.
/// </param>
public static void CrawlUrl(ref CrawlrDataHelper data, ref CrawlrStorageManager storage, string url)
{
    if (!data.ChkIfUriAllowed(url))
    {
        return;
    }

    var web = new HtmlWeb();
    var currDoc = web.Load(url);

    // A page without a <title> must not crash the crawler: fall back to an
    // empty title (which produces no index words) instead of letting
    // First() throw.
    var titleNode = currDoc.DocumentNode.Descendants("title").FirstOrDefault();
    string urlPageTitle = titleNode != null ? titleNode.InnerText : string.Empty;

    // The pubdate meta tag is optional metadata. A missing "content"
    // attribute or a value that is not a date leaves urlLastMod null rather
    // than throwing.
    DateTime? urlLastMod = null;
    var pubDateNode = currDoc.DocumentNode.Descendants("meta")
        .Where(y => y.Attributes.Contains("name"))
        .FirstOrDefault(y => y.Attributes["name"].Value == "pubdate");
    if (pubDateNode != null)
    {
        var contentAttribute = pubDateNode.Attributes["content"];
        DateTime parsedLastMod;
        if (contentAttribute != null
            && DateTime.TryParse(contentAttribute.Value, out parsedLastMod))
        {
            urlLastMod = parsedLastMod;
        }
    }

    // Collect every href first, then hand each one to ChkAndAddUrl.
    // The last-modified argument is passed as null here, as in the original.
    List<string> urlsToQueue = currDoc.DocumentNode.Descendants("a")
        .Where(a => a.Attributes.Contains("href"))
        .Select(a => a.Attributes["href"].Value)
        .ToList();
    foreach (string newUrl in urlsToQueue)
    {
        ChkAndAddUrl(newUrl, url, null, ref data, ref storage);
    }

    if (!data.AddedUrls.Contains(url))
    {
        data.AddedUrls.Add(url);
    }
    data.NumUrlsCrawled++;

    string lastModText = urlLastMod.HasValue ? urlLastMod.Value.ToString() : "NULL";
    foreach (string s in urlPageTitle.Split(' '))
    {
        // Invariant lower-casing keeps index keys stable across server
        // cultures; a culture-sensitive ToLower() can map 'I' to a non-ASCII
        // character that the filter below would then strip.
        string plainText = Regex.Replace(s.ToLowerInvariant(), "[^a-zA-Z0-9]", "");
        if (plainText != "")
        {
            IndexedUrl wordToUrl = new IndexedUrl(plainText, urlPageTitle, lastModText, url);
            TableOperation insertWordToUrl = TableOperation.InsertOrReplace(wordToUrl);
            storage.UrlTable.Execute(insertWordToUrl);
            data.NumUrlsIndexed++;
        }
    }

    UrlTableCount newCount = new UrlTableCount(data.NumUrlsCrawled, data.NumUrlsIndexed);
    TableOperation insertCount = TableOperation.InsertOrReplace(newCount);
    storage.UrlTable.Execute(insertCount);

    // Keep LastTenUrls capped at ten entries by dropping the oldest one.
    if (data.LastTenUrls.Count == 10)
    {
        data.LastTenUrls.Dequeue();
    }
    data.LastTenUrls.Enqueue(url);
}