/// <summary>
/// Crawls a page and, recursively, every not-yet-crawled page reachable from it.
/// </summary>
/// <param name="url">The url to crawl.</param>
private static void CrawlPage(string url)
{
    // Guard clause: never process the same url twice.
    if (PageHasBeenCrawled(url))
    {
        return;
    }

    string htmlText = GetWebText(url);

    Page page = new Page();
    page.Text = htmlText;
    page.Url = url;
    page.CalculateViewstateSize();
    _pages.Add(page);

    LinkParser linkParser = new LinkParser();
    linkParser.ParseLinks(page, url);

    CSSClassParser classParser = new CSSClassParser();
    classParser.ParseForCssClasses(page);

    // Merge this page's parser output into the shared crawl-state lists.
    AddRangeButNoDuplicates(_externalUrls, linkParser.ExternalUrls);
    AddRangeButNoDuplicates(_otherUrls, linkParser.OtherUrls);
    AddRangeButNoDuplicates(_failedUrls, linkParser.BadUrls);
    AddRangeButNoDuplicates(_classes, classParser.Classes);

    foreach (string exception in linkParser.Exceptions)
    {
        _exceptions.Add(exception);
    }

    // Recurse into each crawlable link found on this page; a failure on one
    // link is recorded and must not abort the rest of the crawl.
    foreach (string link in linkParser.GoodUrls)
    {
        string formattedLink = link;
        try
        {
            formattedLink = FixPath(url, formattedLink);
            if (formattedLink != String.Empty)
            {
                CrawlPage(formattedLink);
            }
        }
        catch (Exception exc)
        {
            _failedUrls.Add(formattedLink + " (on page at url " + url + ") - " + exc.Message);
        }
    }
}
/// <summary>
/// Parses the page looking for css classes that are in use.
/// Each class name found between the quotes of a matched class attribute is
/// added to the internal class list exactly once.
/// </summary>
/// <param name="page">The page to parse.</param>
public void ParseForCssClasses(Page page)
{
    MatchCollection matches = Regex.Matches(page.Text, _CSS_CLASS_REGEX);
    for (int i = 0; i < matches.Count; i++)
    {
        string matchValue = matches[i].Value;

        // Compute the quote positions once instead of re-scanning the
        // match value on every call (the original ran IndexOf three times).
        int firstQuote = matchValue.IndexOf('"');
        int lastQuote = matchValue.LastIndexOf('"');

        // Guard against a match without a complete quoted value: the
        // unguarded Substring previously threw ArgumentOutOfRangeException
        // and aborted the crawl of the whole page.
        if (firstQuote < 0 || lastQuote <= firstQuote)
        {
            continue;
        }

        // Class attributes may hold several space-separated class names.
        string[] classesArray = matchValue
            .Substring(firstQuote + 1, lastQuote - firstQuote - 1)
            .Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

        foreach (string classValue in classesArray)
        {
            // De-duplicate: record each css class only once.
            if (!_classes.Contains(classValue))
            {
                _classes.Add(classValue);
            }
        }
    }
}
/// <summary>
/// Parses a page looking for links and classifies each href found as a
/// good (crawlable) url, an external url, a non-page url, or a bad url.
/// </summary>
/// <param name="page">The page whose text is to be parsed.</param>
/// <param name="sourceUrl">The source url of the page.</param>
public void ParseLinks(Page page, string sourceUrl)
{
    MatchCollection matches = Regex.Matches(page.Text, _LINK_REGEX);
    for (int i = 0; i < matches.Count; i++)
    {
        Match anchorMatch = matches[i];
        if (anchorMatch.Value == String.Empty)
        {
            BadUrls.Add("Blank url value on page " + sourceUrl);
            continue;
        }

        string foundHref = null;
        try
        {
            // Strip the leading href=" and the trailing " to isolate the url.
            foundHref = anchorMatch.Value.Replace("href=\"", "");
            foundHref = foundHref.Substring(0, foundHref.IndexOf("\""));
        }
        catch (Exception exc)
        {
            Exceptions.Add("Error parsing matched href: " + exc.Message);
            // BUG FIX: previously execution fell through here with a null or
            // half-parsed href, which was then classified and added to the
            // url lists. Skip this match instead.
            continue;
        }

        if (!GoodUrls.Contains(foundHref))
        {
            if (IsExternalUrl(foundHref))
            {
                _externalUrls.Add(foundHref);
            }
            else if (!IsAWebPage(foundHref))
            {
                // Resource links (images, css, downloads, ...) are resolved
                // against the source url and tracked separately.
                foundHref = Crawler.FixPath(sourceUrl, foundHref);
                _otherUrls.Add(foundHref);
            }
            else
            {
                GoodUrls.Add(foundHref);
            }
        }
    }
}