/// <summary>
/// Crawls a web site (or part of a web site) starting at <paramref name="urlRoot"/>, visiting each
/// internal link just once. Every HTML page fetched is handed back as a WebPage.Internal, every
/// link that is not under <paramref name="urlRoot"/> as a WebPage.External (not fetched), and every
/// page whose download fails with a <see cref="System.Net.WebException"/> as a WebPage.Error.
/// </summary>
/// <remarks>
/// Use .OfType WebPage.Internal to get just the internal ones if that's what you want.
/// Internal Urls whose response is not "text/html" are fetched but produce no result.
/// The crawl stops after a fixed number of fetches (see maxPagesToFetch in the body).
/// This is an iterator: nothing is requested, and no exception is thrown, until the
/// returned sequence is enumerated.
/// </remarks>
/// <param name="urlRoot">Absolute Url to start from; only Urls it is a base of are followed.</param>
/// <returns>A lazily produced sequence of the pages and links discovered.</returns>
/// <exception cref="ArgumentNullException"><paramref name="urlRoot"/> is null.</exception>
public static IEnumerable<WebPage> GetAllPagesUnder(Uri urlRoot)
{
    if (urlRoot == null)
    {
        throw new ArgumentNullException("urlRoot");
    }

    // Safety cap on the number of pages fetched in one crawl (value kept from the original code).
    const int maxPagesToFetch = 5;
    int safetyCount = 0;

    var queue = new Queue<Uri>();
    var allSiteUrls = new HashSet<Uri>();
    queue.Enqueue(urlRoot);
    allSiteUrls.Add(urlRoot);

    while (queue.Count > 0 && safetyCount++ < maxPagesToFetch)
    {
        Uri url = queue.Dequeue();
        WebPage result = null;
        HtmlDocument doc = null;

        // 'yield return' is not allowed inside a try block that has catch clauses, so the
        // download only records its outcome here and the hand-off happens after the try.
        try
        {
            HttpWebRequest oReq = (HttpWebRequest)WebRequest.Create(url);
            oReq.UserAgent = @"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5";

            // Dispose the response (and its stream) so connections are released promptly.
            using (HttpWebResponse resp = (HttpWebResponse)oReq.GetResponse())
            {
                string contentType = resp.ContentType ?? string.Empty;
                if (contentType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase))
                {
                    var loaded = new HtmlDocument(); // The HtmlAgilityPack
                    using (var resultStream = resp.GetResponseStream())
                    {
                        loaded.Load(resultStream);
                    }

                    // Only publish the document once it has loaded completely.
                    doc = loaded;
                    result = new WebPage.Internal() { Url = url, HtmlDocument = doc };
                }
            }
        }
        catch (System.Net.WebException ex)
        {
            // A failed download (including HTTP error statuses) is reported, not fatal.
            result = new WebPage.Error() { Url = url, Exception = ex };
        }
        catch (Exception ex)
        {
            // Annotate the exception with the Url; the indexer avoids throwing if the key exists.
            ex.Data["Url"] = url;
            throw;
        }

        if (result == null)
        {
            continue; // Not an HTML page: nothing to hand off and no links to follow
        }

        // Hand off the page (or the error describing why it could not be fetched)
        yield return result;

        if (doc == null)
        {
            continue; // Error result: there is no document to scan for links
        }

        // SelectNodes returns null, not an empty collection, when nothing matches.
        var links = doc.DocumentNode.SelectNodes(@"//a[@href]");
        if (links == null)
        {
            continue;
        }

        // And now queue up all the links on this page
        foreach (HtmlNode link in links)
        {
            HtmlAttribute att = link.Attributes["href"];
            if (att == null)
            {
                continue;
            }

            string href = att.Value;
            if (href.StartsWith("javascript", StringComparison.OrdinalIgnoreCase))
            {
                continue; // ignore javascript on buttons using a tags
            }

            Uri urlNext;
            if (!Uri.TryCreate(href, UriKind.RelativeOrAbsolute, out urlNext))
            {
                continue; // malformed href: skip it rather than abort the crawl
            }

            // Make it absolute if it's relative. Relative links are relative to the page
            // they appear on, not to the crawl root.
            if (!urlNext.IsAbsoluteUri && !Uri.TryCreate(url, urlNext, out urlNext))
            {
                continue;
            }

            // Add returns false when the Url has already been handed off or queued.
            if (!allSiteUrls.Add(urlNext))
            {
                continue;
            }

            if (urlRoot.IsBaseOf(urlNext))
            {
                queue.Enqueue(urlNext);
            }
            else
            {
                yield return new WebPage.External() { Url = urlNext };
            }
        }
    }
}
/// <summary>
/// Crawls a web site (or part of a web site) starting at <paramref name="urlRoot"/>, visiting each
/// internal link just once. Every HTML page fetched is handed back as a WebPage.Internal, every
/// link that is not under <paramref name="urlRoot"/> as a WebPage.External (not fetched), and every
/// page whose download fails with a <see cref="System.Net.WebException"/> as a WebPage.Error.
/// </summary>
/// <remarks>
/// Use .OfType WebPage.Internal to get just the internal ones if that's what you want.
/// Internal Urls whose response is not "text/html" are fetched but produce no result.
/// The crawl stops after a fixed number of fetches (see maxPagesToFetch in the body).
/// This is an iterator: nothing is requested, and no exception is thrown, until the
/// returned sequence is enumerated.
/// </remarks>
/// <param name="urlRoot">Absolute Url to start from; only Urls it is a base of are followed.</param>
/// <returns>A lazily produced sequence of the pages and links discovered.</returns>
/// <exception cref="ArgumentNullException"><paramref name="urlRoot"/> is null.</exception>
public static IEnumerable<WebPage> GetAllPagesUnder(Uri urlRoot)
{
    if (urlRoot == null)
    {
        throw new ArgumentNullException("urlRoot");
    }

    // Safety cap on the number of pages fetched in one crawl (value kept from the original code).
    const int maxPagesToFetch = 5;
    int safetyCount = 0;

    var queue = new Queue<Uri>();
    var allSiteUrls = new HashSet<Uri>();
    queue.Enqueue(urlRoot);
    allSiteUrls.Add(urlRoot);

    while (queue.Count > 0 && safetyCount++ < maxPagesToFetch)
    {
        Uri url = queue.Dequeue();
        WebPage result = null;
        HtmlDocument doc = null;

        // 'yield return' is not allowed inside a try block that has catch clauses, so the
        // download only records its outcome here and the hand-off happens after the try.
        try
        {
            HttpWebRequest oReq = (HttpWebRequest)WebRequest.Create(url);
            oReq.UserAgent = @"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5";

            // Dispose the response (and its stream) so connections are released promptly.
            using (HttpWebResponse resp = (HttpWebResponse)oReq.GetResponse())
            {
                string contentType = resp.ContentType ?? string.Empty;
                if (contentType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase))
                {
                    var loaded = new HtmlDocument(); // The HtmlAgilityPack
                    using (var resultStream = resp.GetResponseStream())
                    {
                        loaded.Load(resultStream);
                    }

                    // Only publish the document once it has loaded completely.
                    doc = loaded;
                    result = new WebPage.Internal() { Url = url, HtmlDocument = doc };
                }
            }
        }
        catch (System.Net.WebException ex)
        {
            // A failed download (including HTTP error statuses) is reported, not fatal.
            result = new WebPage.Error() { Url = url, Exception = ex };
        }
        catch (Exception ex)
        {
            // Annotate the exception with the Url; the indexer avoids throwing if the key exists.
            ex.Data["Url"] = url;
            throw;
        }

        if (result == null)
        {
            continue; // Not an HTML page: nothing to hand off and no links to follow
        }

        // Hand off the page (or the error describing why it could not be fetched)
        yield return result;

        if (doc == null)
        {
            continue; // Error result: there is no document to scan for links
        }

        // SelectNodes returns null, not an empty collection, when nothing matches.
        var links = doc.DocumentNode.SelectNodes(@"//a[@href]");
        if (links == null)
        {
            continue;
        }

        // And now queue up all the links on this page
        foreach (HtmlNode link in links)
        {
            HtmlAttribute att = link.Attributes["href"];
            if (att == null)
            {
                continue;
            }

            string href = att.Value;
            if (href.StartsWith("javascript", StringComparison.OrdinalIgnoreCase))
            {
                continue; // ignore javascript on buttons using a tags
            }

            Uri urlNext;
            if (!Uri.TryCreate(href, UriKind.RelativeOrAbsolute, out urlNext))
            {
                continue; // malformed href: skip it rather than abort the crawl
            }

            // Make it absolute if it's relative. Relative links are relative to the page
            // they appear on, not to the crawl root.
            if (!urlNext.IsAbsoluteUri && !Uri.TryCreate(url, urlNext, out urlNext))
            {
                continue;
            }

            // Add returns false when the Url has already been handed off or queued.
            if (!allSiteUrls.Add(urlNext))
            {
                continue;
            }

            if (urlRoot.IsBaseOf(urlNext))
            {
                queue.Enqueue(urlNext);
            }
            else
            {
                yield return new WebPage.External() { Url = urlNext };
            }
        }
    }
}