/// <summary>
/// Classifies the URL captured by an href match and records it on <paramref name="page"/>
/// as either an internal link (<c>page.Links</c>) or an external link (<c>page.ExternalLinks</c>).
/// </summary>
/// <param name="page">Page whose link collections receive the URL; each URL is stored once, keyed by itself.</param>
/// <param name="href">Match whose first capture group holds the raw href value.</param>
private void ParseHref(FlatPage page, Match href)
{
    string url = href.Groups[1].Value;

    // A link that equals the root URL is skipped. The comparison is ordinal and
    // case-insensitive instead of going through ToLower(), whose result depends on
    // the current culture (e.g. the Turkish dotless i).
    if (string.Equals(url, _rootUrl, StringComparison.OrdinalIgnoreCase))
    {
        return;
    }

    // Fragment-only links ("#..." and "/#...") are skipped.
    if (url.StartsWith("#", StringComparison.Ordinal) || url.StartsWith("/#", StringComparison.Ordinal))
    {
        return;
    }

    // A protocol-relative URL ("//host/path") names its own host, so the leading slash
    // alone must not make it internal; it only counts as internal if it contains _domain.
    bool isProtocolRelative = url.StartsWith("//", StringComparison.Ordinal);
    bool isRootRelative = !isProtocolRelative && url.StartsWith("/", StringComparison.Ordinal);

    if (url.Contains(_domain) || isRootRelative)
    {
        if (!page.Links.ContainsKey(url))
        {
            page.Links.Add(url, url);
        }
    }
    else
    {
        if (!page.ExternalLinks.ContainsKey(url))
        {
            page.ExternalLinks.Add(url, url);
        }
    }
}
/// <summary>
/// Finds every match of <c>AnchorRegex</c> in <paramref name="pageContent"/>, looks for an
/// <c>HrefRegex</c> match inside the anchor's first capture group, and passes each
/// successful href match to <c>ParseHref</c>.
/// </summary>
/// <param name="pageContent">Text to scan for anchors.</param>
/// <param name="page">Page that <c>ParseHref</c> records the discovered links on.</param>
private void ParseLinks(string pageContent, FlatPage page)
{
    foreach (Match anchorMatch in Regex.Matches(pageContent, AnchorRegex, RegexOptions.Singleline))
    {
        string anchorCapture = anchorMatch.Groups[1].Value;
        Match hrefMatch = Regex.Match(anchorCapture, HrefRegex, RegexOptions.Singleline);

        // Anchors without a recognisable href contribute nothing.
        if (!hrefMatch.Success)
        {
            continue;
        }

        ParseHref(page, hrefMatch);
    }
}
/// <summary>
/// Processes <paramref name="page"/>: writes a progress line to the console, builds a flat page
/// from its URL via <c>GenerateFlatPage</c>, merges the result into <paramref name="page"/> and
/// <paramref name="siteDictionary"/>, then recurses into every internal link not yet visited.
/// </summary>
/// <param name="page">Page to process; its <c>InternalLinks</c> are walked after the merge.</param>
/// <param name="siteDictionary">Dictionary passed through to <c>MergeFlatPage</c> and to each recursive call.</param>
private void CrawlWebPage(Page page, SiteDictionary siteDictionary)
{
    string progressLine = string.Format("Processing Page {0}", page.Url);
    Console.WriteLine(progressLine);

    FlatPage parsedPage = GenerateFlatPage(page.Url);
    MergeFlatPage(page, parsedPage, siteDictionary);

    // Depth-first: each unvisited child is fully crawled before the next sibling is examined,
    // so the recursion depth follows the length of the link chain.
    foreach (var child in page.InternalLinks)
    {
        if (child.Visited)
        {
            continue;
        }

        CrawlWebPage(child, siteDictionary);
    }
}
/// <summary>
/// Builds a <c>FlatPage</c> for <paramref name="pageUrl"/> and fills it by running
/// <c>ParseLinks</c> and then <c>ParseImages</c> over <paramref name="pageContent"/>.
/// </summary>
/// <param name="pageContent">Page text to scan for links and images.</param>
/// <param name="pageUrl">URL stored on the returned page.</param>
/// <returns>The newly created flat page.</returns>
/// <exception cref="ArgumentException">Thrown when <c>_domain</c> is null, empty or whitespace.</exception>
public FlatPage Parse(string pageContent, string pageUrl)
{
    bool domainMissing = string.IsNullOrWhiteSpace(_domain);
    if (domainMissing)
    {
        throw new ArgumentException("Please invoke SetRootUrl before parsing");
    }

    var result = new FlatPage();
    result.Url = pageUrl;

    ParseLinks(pageContent, result);
    ParseImages(pageContent, result);

    return result;
}
/// <summary>
/// Finds every match of <c>ImageRegex</c> in <paramref name="pageContent"/>, extracts the
/// <c>SrcRegex</c> capture from the image's first capture group, and records each distinct
/// source URL in <c>page.Images</c>, keyed by itself.
/// </summary>
/// <param name="pageContent">Text to scan for images.</param>
/// <param name="page">Page whose <c>Images</c> collection receives the URLs.</param>
private void ParseImages(string pageContent, FlatPage page)
{
    foreach (Match imageMatch in Regex.Matches(pageContent, ImageRegex, RegexOptions.Singleline))
    {
        string imageCapture = imageMatch.Groups[1].Value;
        Match srcMatch = Regex.Match(imageCapture, SrcRegex, RegexOptions.Singleline);

        // Images without a recognisable src contribute nothing.
        if (!srcMatch.Success)
        {
            continue;
        }

        string source = srcMatch.Groups[1].Value;
        if (page.Images.ContainsKey(source))
        {
            continue;
        }

        page.Images.Add(source, source);
    }
}
/// <summary>
/// Copies the links, images and external links of <paramref name="flatPage"/> onto
/// <paramref name="page"/>, then marks <paramref name="page"/> as visited.
/// For each URL, the object already registered in <paramref name="siteDictionary"/> is reused;
/// if none is registered, a new one is created and registered first.
/// </summary>
/// <param name="page">Page that receives the merged entries and has <c>Visited</c> set to true.</param>
/// <param name="flatPage">Source of the URL values to merge.</param>
/// <param name="siteDictionary">Per-URL lookup of <c>Page</c>, <c>Image</c> and <c>ExternalLink</c> objects.</param>
private void MergeFlatPage(Page page, FlatPage flatPage, SiteDictionary siteDictionary)
{
    // Internal links
    foreach (var linkEntry in flatPage.Links)
    {
        string linkUrl = linkEntry.Value;
        Page linkedPage;
        if (!siteDictionary.Links.TryGetValue(linkUrl, out linkedPage))
        {
            linkedPage = new Page { Url = linkUrl };
            siteDictionary.Links.Add(linkUrl, linkedPage);
        }

        page.InternalLinks.Add(linkedPage);
    }

    // Images
    foreach (var imageEntry in flatPage.Images)
    {
        string imageUrl = imageEntry.Value;
        Image sharedImage;
        if (!siteDictionary.Images.TryGetValue(imageUrl, out sharedImage))
        {
            sharedImage = new Image { Url = imageUrl };
            siteDictionary.Images.Add(imageUrl, sharedImage);
        }

        page.Images.Add(sharedImage);
    }

    // External links
    foreach (var externalEntry in flatPage.ExternalLinks)
    {
        string externalUrl = externalEntry.Value;
        ExternalLink sharedLink;
        if (!siteDictionary.ExternalLinks.TryGetValue(externalUrl, out sharedLink))
        {
            sharedLink = new ExternalLink { Url = externalUrl };
            siteDictionary.ExternalLinks.Add(externalUrl, sharedLink);
        }

        page.ExternalLinks.Add(sharedLink);
    }

    page.Visited = true;
}