/// <summary>
/// Processes <paramref name="page"/> and then recursively processes each of its
/// internal links that has not been visited yet (depth-first, in collection order).
/// </summary>
/// <param name="page">The page to process; its URL is logged and handed to <c>GenerateFlatPage</c>.</param>
/// <param name="siteDictionary">Site-wide registry forwarded to <c>MergeFlatPage</c> and to every recursive call.</param>
private void CrawlWebPage(Page page, SiteDictionary siteDictionary)
{
    string progressMessage = string.Format("Processing Page {0}", page.Url);
    Console.WriteLine(progressMessage);

    // Build the flat representation for this URL, then fold it into the page and the registry.
    FlatPage parsed = GenerateFlatPage(page.Url);
    MergeFlatPage(page, parsed, siteDictionary);

    foreach (var child in page.InternalLinks)
    {
        // Checked at iteration time: an earlier sibling's crawl may already have visited this page.
        if (child.Visited)
        {
            continue;
        }

        CrawlWebPage(child, siteDictionary);
    }
}
/// <summary>
/// Crawls the site rooted at <paramref name="url"/> and returns the resulting site map.
/// </summary>
/// <param name="url">URL of the root page; it is also handed to the page parser as the root URL.</param>
/// <returns>
/// A <see cref="SiteMap"/> whose <c>Root</c> is the crawled root page and whose totals are the
/// entry counts of the site dictionary's internal-link, external-link and image collections.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="url"/> is <c>null</c>.</exception>
public SiteMap CrawlWebSite(string url)
{
    // Fail fast at the public boundary instead of letting a null URL reach the parser and crawl.
    if (url == null)
    {
        throw new ArgumentNullException("url");
    }

    var root = new Page { Url = url };
    var siteDictionary = new SiteDictionary();

    _webPageParser.SetRootUrl(url);

    // NOTE(review): the root page is not added to siteDictionary.Links here, so a link back to
    // this URL would get its own Page object and be crawled again - confirm whether intended.
    CrawlWebPage(root, siteDictionary);

    return new SiteMap
    {
        Root = root,
        TotalExternalLinks = siteDictionary.ExternalLinks.Count,
        TotalImages = siteDictionary.Images.Count,
        TotalInternalLinks = siteDictionary.Links.Count
    };
}
/// <summary>
/// Attaches the internal links, images and external links listed in <paramref name="flatPage"/>
/// to <paramref name="page"/>, then marks the page as visited.
/// </summary>
/// <remarks>
/// Each URL is looked up in the matching collection of <paramref name="siteDictionary"/>.
/// When an entry already exists, that same object is attached to the page; otherwise a new
/// object is created for the URL, registered in the dictionary, and attached.
/// </remarks>
/// <param name="page">The page that receives the links and images.</param>
/// <param name="flatPage">Source of the URLs; the <c>Value</c> of each entry is used as the URL.</param>
/// <param name="siteDictionary">Site-wide registry of pages, images and external links keyed by URL.</param>
private void MergeFlatPage(Page page, FlatPage flatPage, SiteDictionary siteDictionary)
{
    // Internal links.
    foreach (var linkEntry in flatPage.Links)
    {
        string linkUrl = linkEntry.Value;
        Page linkedPage;
        if (!siteDictionary.Links.TryGetValue(linkUrl, out linkedPage))
        {
            linkedPage = new Page { Url = linkUrl };
            siteDictionary.Links.Add(linkUrl, linkedPage);
        }

        page.InternalLinks.Add(linkedPage);
    }

    // Images.
    foreach (var imageEntry in flatPage.Images)
    {
        string imageUrl = imageEntry.Value;
        Image pageImage;
        if (!siteDictionary.Images.TryGetValue(imageUrl, out pageImage))
        {
            pageImage = new Image { Url = imageUrl };
            siteDictionary.Images.Add(imageUrl, pageImage);
        }

        page.Images.Add(pageImage);
    }

    // External links.
    foreach (var externalEntry in flatPage.ExternalLinks)
    {
        string externalUrl = externalEntry.Value;
        ExternalLink outboundLink;
        if (!siteDictionary.ExternalLinks.TryGetValue(externalUrl, out outboundLink))
        {
            outboundLink = new ExternalLink { Url = externalUrl };
            siteDictionary.ExternalLinks.Add(externalUrl, outboundLink);
        }

        page.ExternalLinks.Add(outboundLink);
    }

    // Set last, once every collection on the page has been filled in.
    page.Visited = true;
}