/// <summary>
/// Parses a page looking for links and files each discovered href into the
/// external, other (non web page) or good url collection.
/// </summary>
/// <param name="page">The page whose text is to be parsed.</param>
/// <param name="sourceUrl">The source url of the page.</param>
public void ParseLinks(Page page, string sourceUrl)
{
    MatchCollection matches = Regex.Matches(page.Text, _LINK_REGEX);

    foreach (Match anchorMatch in matches)
    {
        if (anchorMatch.Value == String.Empty)
        {
            BadUrls.Add("Blank url value on page " + sourceUrl);
            continue;
        }

        string foundHref;
        try
        {
            // Strip the leading href=" and keep everything up to the closing quote.
            foundHref = anchorMatch.Value.Replace("href=\"", "");
            foundHref = foundHref.Substring(0, foundHref.IndexOf('"'));
        }
        catch (Exception exc)
        {
            Exceptions.Add("Error parsing matched href: " + exc.Message);

            // The href could not be extracted; skip it instead of classifying
            // a null or half-parsed value below.
            continue;
        }

        // Already recorded as a good url, or just the site root: nothing to do.
        if (GoodUrls.Contains(foundHref) || foundHref == "/")
        {
            continue;
        }

        if (IsExternalUrl(foundHref))
        {
            _externalUrls.Add(foundHref);
        }
        else if (!IsAWebPage(foundHref))
        {
            // Non-page resources are stored after being resolved against the source url.
            foundHref = Crawler.FixPath(sourceUrl, foundHref);
            _otherUrls.Add(foundHref);
        }
        else
        {
            GoodUrls.Add(foundHref);
        }
    }
}
/// <summary>
/// Parses a page looking for links. When the source url ends in ".xml" the
/// page text is scanned with the sitemap regex; otherwise it is scanned with
/// the link regex and each href is filed into the external, other or good
/// url collection.
/// </summary>
/// <param name="inputSite">The site being crawled; forwarded to <c>IsExternalUrl</c> and <c>Crawler.FixPath</c>.</param>
/// <param name="page">The page whose text is to be parsed.</param>
/// <param name="sourceUrl">The source url of the page.</param>
public void ParseLinks(InputSite inputSite, Page page, string sourceUrl)
{
    if (sourceUrl.EndsWith(".xml", StringComparison.Ordinal))
    {
        foreach (Match locMatch in Regex.Matches(page.Text, _SITEMAP_REGEX))
        {
            var sitemapUrl = BddJson.NormalizeUrl(locMatch.Value);

            // TODO: extract the url with a Regex match instead of stripping the tags.
            sitemapUrl = sitemapUrl.Replace("<loc>", "");
            sitemapUrl = sitemapUrl.Replace("</loc>", "");

            if (!IsBad(sitemapUrl) && !GoodUrls.Contains(sitemapUrl))
            {
                GoodUrls.Add(sitemapUrl);
            }
        }

        return;
    }

    foreach (Match anchorMatch in Regex.Matches(page.Text, _LINK_REGEX))
    {
        if (anchorMatch.Value == string.Empty)
        {
            BadUrls.Add("Blank url value on page " + sourceUrl);
            continue;
        }

        string foundHref;
        try
        {
            // Strip the leading href=" and keep everything up to the closing quote.
            foundHref = anchorMatch.Value.Replace("href=\"", "");
            foundHref = foundHref.Substring(0, foundHref.IndexOf('"'));
        }
        catch (Exception exc)
        {
            Exceptions.Add("Error parsing matched href: " + exc.Message);

            // The href could not be extracted; skip it instead of normalizing
            // and classifying a null or half-parsed value below.
            continue;
        }

        foundHref = BddJson.NormalizeUrl(foundHref);

        if (IsBad(foundHref) || GoodUrls.Contains(foundHref))
        {
            continue;
        }

        if (IsExternalUrl(inputSite, foundHref))
        {
            ExternalUrls.Add(foundHref);
        }
        else if (!IsAWebPage(foundHref))
        {
            // NOTE(review): the found href is discarded here and the result of
            // FixPath(inputSite, sourceUrl) is stored instead - confirm against
            // Crawler.FixPath that this is intended.
            foundHref = Crawler.FixPath(inputSite, sourceUrl);
            OtherUrls.Add(foundHref);
        }
        else
        {
            GoodUrls.Add(foundHref);
        }
    }
}