Example #1
internal void AddExternalUrl(string href)
{
    // Record a link that points outside the crawled (allowed) domain.
    ExternalUrls.Add(href);
}
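
The snippet above only shows the Add call. Below is a minimal, hedged sketch of how it might be used; the WebCrawler class shell, the HashSet<string> backing ExternalUrls, and the sample URL are assumptions for illustration, not part of the original example.

using System;
using System.Collections.Generic;

internal class WebCrawler
{
    // Assumed backing collection; the original example only shows the Add call.
    internal HashSet<string> ExternalUrls { get; } = new HashSet<string>();

    internal void AddExternalUrl(string href)
    {
        ExternalUrls.Add(href);
    }
}

internal static class Program
{
    private static void Main()
    {
        var crawler = new WebCrawler();
        crawler.AddExternalUrl("https://example.com/outside-link");
        Console.WriteLine(crawler.ExternalUrls.Count); // prints 1
    }
}
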
Example #2
        public void crawlUrl(string urlToCrawl, List<string> parentWillCrawl = null)
        {
            Url aUrl = new Url()
            {
                Uri = urlToCrawl
            };

            CrawledDomainUrlsAsUrlObjects.Add(aUrl);
            if (CrawledAllowedDomainUrls.Contains(urlToCrawl) || CrawledAllowedDomainUrls.Contains(urlToCrawl.Split('#').First()))
            {
                return;
            }

            CrawledAllowedDomainUrls.Add(urlToCrawl);

            //Load Page And Capture Time To Load
            DateTime preLoad = DateTime.Now;

            WebDriver.Navigate().GoToUrl(urlToCrawl);
            // Use TotalMilliseconds so the full elapsed time is recorded (.Milliseconds is only the 0-999 ms component).
            aUrl.MsToLoad = (int) (DateTime.Now - preLoad).TotalMilliseconds;

            //Take Screenshot of Page
            String basePath = SavePicturesToPath + aUrl.FileNameBase;
            int    height   = WebDriver.FindElement(By.TagName("body")).Size.Height;

            WebDriver.Manage().Window.Size = new Size(WebDriver.Manage().Window.Size.Width, height + 250);
            ((ITakesScreenshot)WebDriver).GetScreenshot().SaveAsFile(basePath + ".png",
                                                                     ScreenshotImageFormat.Png);
            aUrl.PicturePath    = basePath + ".png";
            aUrl.HtmlSourcePath = basePath + ".html";

            //Save Source Code of Page
            File.WriteAllText(aUrl.HtmlSourcePath, WebDriver.PageSource);

            //Find All Links On The Page
            ReadOnlyCollection<IWebElement> links = WebDriver.FindElements(By.TagName("a"));
            List<string> urlsFoundOnThisPage      = new List<string>();

            foreach (var link in links)
            {
                string href = link.GetAttribute("href");

                if (href != null)
                {
                    // Strip the fragment (#anchor) so the same page isn't queued once per anchor.
                    urlsFoundOnThisPage.Add(href.Split('#').First());
                }
            }

            UpdateProgressListeners(urlToCrawl, urlsFoundOnThisPage);
            //Generate list of known urls to pass to this method so that we don't crawl forever.
            List<string> parentsWillCrawl = new List<string>();

            if (parentWillCrawl != null)
            {
                parentsWillCrawl.AddRange(parentWillCrawl);
            }

            parentsWillCrawl.AddRange(urlsFoundOnThisPage);

            foreach (var url in urlsFoundOnThisPage)
            {
                if (InAllowedDomain(url) && !CrawledAllowedDomainUrls.Contains(url) && (parentWillCrawl == null || !parentWillCrawl.Contains(url)))
                {
                    crawlUrl(url, parentsWillCrawl);
                }
                else
                {
                    ExternalUrls.Add(url);
                    //If it is a url that is in the list of external urls to capture, go ahead.
                    foreach (var captureExternalUrl in Config.CapturePagesLinkedInTheseDomains)
                    {
                        if (UrlsMatch(captureExternalUrl, url))
                        {
                            crawlUrl(url, parentsWillCrawl);
                            break;
                        }
                    }
                }
            }
        }
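
For orientation, a hedged sketch of how a crawl might be started with this method. The SiteCrawler class name, its object-initializer setup, and the output path are assumptions made for illustration; only the crawlUrl method, and the WebDriver and SavePicturesToPath members it reads, appear in the original example. The ChromeDriver setup is standard Selenium.

using OpenQA.Selenium.Chrome;

internal static class Program
{
    private static void Main()
    {
        using (var driver = new ChromeDriver())
        {
            // Hypothetical crawler class exposing the crawlUrl method shown above.
            var crawler = new SiteCrawler
            {
                WebDriver          = driver,
                SavePicturesToPath = @"C:\crawl-output\"
            };

            // Start the recursive crawl from the site root; crawlUrl builds the list of
            // already-known URLs itself as it recurses, so no second argument is needed here.
            crawler.crawlUrl("https://example.com/");
        }
    }
}
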
Example #3
        /// <summary>
        ///     Parses a page looking for links.
        /// </summary>
        /// <param name="inputSite">The site being crawled; used to classify external links and fix relative paths.</param>
        /// <param name="page">The page whose text is to be parsed.</param>
        /// <param name="sourceUrl">The source url of the page.</param>
        public void ParseLinks(InputSite inputSite, Page page, string sourceUrl)
        {
            if (sourceUrl.EndsWith(".xml"))
            {
                var matches = Regex.Matches(page.Text, _SITEMAP_REGEX);

                for (var i = 0; i <= matches.Count - 1; i++)
                {
                    var anchorMatch = matches[i];
                    var foundHref   = BddJson.NormalizeUrl(anchorMatch.Value);
                    // TODO: extract the URL with a regex match instead of string replacement
                    foundHref = foundHref.Replace("<loc>", "");
                    foundHref = foundHref.Replace("</loc>", "");

                    if (!IsBad(foundHref) && !GoodUrls.Contains(foundHref))
                    {
                        GoodUrls.Add(foundHref);
                    }
                }
            }
            else
            {
                var matches = Regex.Matches(page.Text, _LINK_REGEX);

                for (var i = 0; i <= matches.Count - 1; i++)
                {
                    var anchorMatch = matches[i];

                    if (anchorMatch.Value == string.Empty)
                    {
                        BadUrls.Add("Blank url value on page " + sourceUrl);
                        continue;
                    }

                    string foundHref = null;
                    try
                    {
                        foundHref = anchorMatch.Value.Replace("href=\"", "");
                        foundHref = foundHref.Substring(0, foundHref.IndexOf("\""));
                    }
                    catch (Exception exc)
                    {
                        Exceptions.Add("Error parsing matched href: " + exc.Message);
                        // Skip this match: foundHref is not usable if extraction failed.
                        continue;
                    }

                    foundHref = BddJson.NormalizeUrl(foundHref);

                    if (!IsBad(foundHref) && !GoodUrls.Contains(foundHref))
                    {
                        if (IsExternalUrl(inputSite, foundHref))
                        {
                            ExternalUrls.Add(foundHref);
                        }
                        else if (!IsAWebPage(foundHref))
                        {
                            foundHref = Crawler.FixPath(inputSite, sourceUrl);
                            OtherUrls.Add(foundHref);
                        }
                        else
                        {
                            GoodUrls.Add(foundHref);
                        }
                    }
                }
            }
        }
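
The _LINK_REGEX and _SITEMAP_REGEX constants are not shown in the example. The sketch below is only a plausible guess at what they might look like, chosen so that the string handling above (stripping href=" and the closing quote, and removing the <loc> tags) would work; the pattern strings and the demo class are assumptions, not the original definitions.

using System;
using System.Text.RegularExpressions;

internal static class LinkPatternsDemo
{
    // Assumed pattern: matches an href attribute including both quotes, e.g. href="https://example.com/page".
    private const string _LINK_REGEX = "href=\"[^\"]*\"";

    // Assumed pattern: matches a sitemap <loc> element and its URL, e.g. <loc>https://example.com/page</loc>.
    private const string _SITEMAP_REGEX = "<loc>[^<]*</loc>";

    private static void Main()
    {
        const string html = "<a href=\"https://example.com/page\">Page</a>";

        foreach (Match m in Regex.Matches(html, _LINK_REGEX))
        {
            Console.WriteLine(m.Value); // href="https://example.com/page"
        }
    }
}
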