internal void AddExternalUrl(string href)
{
    ExternalUrls.Add(href);
}
public void crawlUrl(string urlToCrawl, List<string> parentWillCrawl = null)
{
    Url aUrl = new Url() { Uri = urlToCrawl };
    CrawledDomainUrlsAsUrlObjects.Add(aUrl);

    // Skip urls we have already crawled (with or without a fragment).
    if (CrawledAllowedDomainUrls.Contains(urlToCrawl) || CrawledAllowedDomainUrls.Contains(urlToCrawl.Split('#').First()))
    {
        return;
    }
    CrawledAllowedDomainUrls.Add(urlToCrawl);

    // Load the page and capture the time it takes to load.
    DateTime preLoad = DateTime.Now;
    WebDriver.Navigate().GoToUrl(urlToCrawl);
    aUrl.MsToLoad = (int)(DateTime.Now - preLoad).TotalMilliseconds; // TotalMilliseconds, not Milliseconds (which is only the 0-999 component)

    // Take a full-height screenshot of the page.
    string basePath = SavePicturesToPath + aUrl.FileNameBase;
    int height = WebDriver.FindElement(By.TagName("body")).Size.Height;
    WebDriver.Manage().Window.Size = new Size(WebDriver.Manage().Window.Size.Width, height + 250);
    ((ITakesScreenshot)WebDriver).GetScreenshot().SaveAsFile(basePath + ".png", ScreenshotImageFormat.Png);
    aUrl.PicturePath = basePath + ".png";

    // Save the source code of the page.
    aUrl.HtmlSourcePath = basePath + ".html";
    File.WriteAllText(aUrl.HtmlSourcePath, WebDriver.PageSource);

    // Collect every anchor href on the page, dropping fragments.
    ReadOnlyCollection<IWebElement> links = WebDriver.FindElements(By.TagName("a"));
    List<string> urlsFoundOnThisPage = new List<string>();
    foreach (var link in links)
    {
        if (link.GetAttribute("href") != null)
        {
            urlsFoundOnThisPage.Add(link.GetAttribute("href").Split('#').First());
        }
    }
    UpdateProgressListeners(urlToCrawl, urlsFoundOnThisPage);

    // Generate the list of known urls to pass to this method so that we don't crawl forever.
    List<string> parentsWillCrawl = new List<string>();
    if (parentWillCrawl != null)
    {
        parentsWillCrawl.AddRange(parentWillCrawl);
    }
    parentsWillCrawl.AddRange(urlsFoundOnThisPage);

    foreach (var url in urlsFoundOnThisPage)
    {
        if (InAllowedDomain(url) && !CrawledAllowedDomainUrls.Contains(url) && (parentWillCrawl == null || !parentWillCrawl.Contains(url)))
        {
            crawlUrl(url, parentsWillCrawl);
        }
        else
        {
            ExternalUrls.Add(url);

            // If it is a url that is in the list of external urls to capture, go ahead and crawl it too.
            foreach (var captureExternalUrl in Config.CapturePagesLinkedInTheseDomains)
            {
                if (UrlsMatch(captureExternalUrl, url))
                {
                    crawlUrl(url, parentsWillCrawl);
                    break;
                }
            }
        }
    }
}
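// Usage sketch (for illustration only): the class that owns crawlUrl is not shown in this
// excerpt, so "SiteCrawler" below is a hypothetical name for it, and the output path is an
// assumption. Any IWebDriver implementation (ChromeDriver, FirefoxDriver, ...) can back WebDriver.
//
//     var crawler = new SiteCrawler();
//     crawler.WebDriver = new OpenQA.Selenium.Chrome.ChromeDriver();
//     crawler.SavePicturesToPath = @"C:\crawl-output\";   // screenshots and page sources are written here
//     crawler.crawlUrl("https://example.com/");           // recursion stops once every in-domain url has been visited
//     crawler.WebDriver.Quit();                           // dispose the browser when the crawl finishes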
/// <summary>
/// Parses a page looking for links.
/// </summary>
/// <param name="inputSite">The site being crawled, used to classify urls as internal or external.</param>
/// <param name="page">The page whose text is to be parsed.</param>
/// <param name="sourceUrl">The source url of the page.</param>
public void ParseLinks(InputSite inputSite, Page page, string sourceUrl)
{
    if (sourceUrl.EndsWith(".xml"))
    {
        // Sitemap: every <loc>...</loc> entry is a candidate url.
        var matches = Regex.Matches(page.Text, _SITEMAP_REGEX);
        for (var i = 0; i < matches.Count; i++)
        {
            var anchorMatch = matches[i];
            var foundHref = BddJson.NormalizeUrl(anchorMatch.Value);

            // TODO: extract the url with a regex group instead of string replacement.
            foundHref = foundHref.Replace("<loc>", "");
            foundHref = foundHref.Replace("</loc>", "");

            if (!IsBad(foundHref) && !GoodUrls.Contains(foundHref))
            {
                GoodUrls.Add(foundHref);
            }
        }
    }
    else
    {
        // Regular HTML page: look for href="..." attributes.
        var matches = Regex.Matches(page.Text, _LINK_REGEX);
        for (var i = 0; i < matches.Count; i++)
        {
            var anchorMatch = matches[i];
            if (anchorMatch.Value == string.Empty)
            {
                BadUrls.Add("Blank url value on page " + sourceUrl);
                continue;
            }

            string foundHref = null;
            try
            {
                foundHref = anchorMatch.Value.Replace("href=\"", "");
                foundHref = foundHref.Substring(0, foundHref.IndexOf("\""));
            }
            catch (Exception exc)
            {
                Exceptions.Add("Error parsing matched href: " + exc.Message);
                continue; // skip this match rather than normalizing a null href
            }

            foundHref = BddJson.NormalizeUrl(foundHref);
            if (!IsBad(foundHref) && !GoodUrls.Contains(foundHref))
            {
                if (IsExternalUrl(inputSite, foundHref))
                {
                    ExternalUrls.Add(foundHref);
                }
                else if (!IsAWebPage(foundHref))
                {
                    // Resolve the (possibly relative) link before recording it as a non-page resource.
                    foundHref = Crawler.FixPath(inputSite, foundHref);
                    OtherUrls.Add(foundHref);
                }
                else
                {
                    GoodUrls.Add(foundHref);
                }
            }
        }
    }
}
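// The _LINK_REGEX and _SITEMAP_REGEX constants referenced above are not part of this excerpt.
// The patterns below are a minimal sketch consistent with the string handling in ParseLinks
// (which strips a leading href=" plus the closing quote, and the <loc>...</loc> tags); the
// patterns used by the original class may differ.
private const string _LINK_REGEX = "href=\"[^\"]*\"";      // matches href="..." attributes in the page source
private const string _SITEMAP_REGEX = "<loc>[^<]*</loc>";  // matches <loc>...</loc> entries in a sitemap.xml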