/// <summary>
/// Scans the text of a page for link matches and sorts each extracted href
/// into the external, other (non-page), or good URL collections.
/// </summary>
/// <remarks>
/// Matches are produced by applying <c>_LINK_REGEX</c> to <c>page.Text</c>.
/// The pattern is defined elsewhere; this method assumes each match starts
/// with <c>href="</c> and contains the closing quote (TODO confirm against
/// the pattern). A match whose href cannot be extracted is recorded in
/// <c>Exceptions</c> and skipped, so a malformed value never reaches the
/// URL collections.
/// </remarks>
/// <param name="page">The page whose text is to be parsed.</param>
/// <param name="sourceUrl">The source url of the page.</param>
public void ParseLinks(Page page, string sourceUrl)
{
    MatchCollection matches = Regex.Matches(page.Text, _LINK_REGEX);

    foreach (Match anchorMatch in matches)
    {
        if (anchorMatch.Value == String.Empty)
        {
            BadUrls.Add("Blank url value on page " + sourceUrl);
            continue;
        }

        string foundHref;
        try
        {
            // Strip the attribute prefix, then keep everything up to the closing quote.
            foundHref = anchorMatch.Value.Replace("href=\"", "");
            foundHref = foundHref.Substring(0, foundHref.IndexOf("\""));
        }
        catch (Exception exc)
        {
            Exceptions.Add("Error parsing matched href: " + exc.Message);

            // Without a cleanly extracted href there is nothing valid to classify;
            // falling through would file a null or un-truncated string as a URL.
            continue;
        }

        // Already-known good URLs and the bare root link are ignored.
        if (GoodUrls.Contains(foundHref) || foundHref == "/")
        {
            continue;
        }

        if (IsExternalUrl(foundHref))
        {
            _externalUrls.Add(foundHref);
        }
        else if (!IsAWebPage(foundHref))
        {
            // Non-page resources are stored after being resolved against the source URL.
            foundHref = Crawler.FixPath(sourceUrl, foundHref);
            _otherUrls.Add(foundHref);
        }
        else
        {
            GoodUrls.Add(foundHref);
        }
    }
}