Пример #1
0
        /// <summary>
        /// Opens the planned URI in the browser driver, records the rendered page source and
        /// collects the <c>href</c> of every anchor on the page as a de-duplicated list of child links.
        /// </summary>
        /// <param name="plan">
        /// Crawl plan; its <c>AbsoluteUri</c> is the page to open and is also passed to
        /// <c>Util.GetUriObjectFromUriString</c> together with each discovered href.
        /// </param>
        /// <returns>
        /// The crawl result. If navigating or reading the page source throws, only
        /// <c>BrowserFailedAt</c> and <c>BrowserFailedException</c> are filled in. Otherwise
        /// <c>BrowserCrawledAt</c>, <c>BrowserContent</c> and <c>LinkAbsoluteUris</c> are set.
        /// </returns>
        public CrawlResult CrawlPage(CrawlPlan plan)
        {
            var crawlResult = new CrawlResult(plan.AbsoluteUri);

            string driverPageSource;

            try
            {
                //todo: disable redirect
                driver.Navigate().GoToUrl(plan.AbsoluteUri);

                // Optional fixed pause (in seconds) between navigation and reading the page source.
                if (_pageLoadWait > 0)
                {
                    Thread.Sleep(TimeSpan.FromSeconds(_pageLoadWait));
                }

                driverPageSource = driver.PageSource;
            }
            catch (Exception e)
            {
                // Best effort: a browser failure is recorded on the result instead of being rethrown.
                Console.WriteLine(e);

                crawlResult.BrowserFailedAt        = DateTime.UtcNow;
                crawlResult.BrowserFailedException = e.ToString();

                return(crawlResult);
            }

            crawlResult.BrowserCrawledAt = DateTime.UtcNow;
            crawlResult.BrowserContent   = driverPageSource;

            // Raw href values in discovery order; duplicates are removed further down.
            var hrefs = new List <string>();

            //find links
            var linkElements = driver.FindElements(By.XPath("//a[@href]"));

            if (linkElements != null)
            {
                hrefs.AddRange(linkElements.Select(o => o.GetAttribute("href")));
            }

            //------------------------------------------
            //Some web pages do not show all elements unless interacted with (e.g. hovering),
            //so hover over each configured element and look for more links afterwards.
            //------------------------------------------
            var hoverableList = new string[] {
                //"//p[.='Creators']", "//p[.='Learn more']", "//p[.='Crypto Community']"
            };

            if (hoverableList.Length > 0)
            {
                var action = new Actions(driver);
                foreach (var hoverableXPath in hoverableList)
                {
                    // FindElement throws when nothing matches instead of returning null, so use
                    // FindElements and take the first match; a missing hover target is skipped.
                    var hoverableElements = driver.FindElements(By.XPath(hoverableXPath));
                    if (hoverableElements == null || hoverableElements.Count == 0)
                    {
                        continue;
                    }

                    action.MoveToElement(hoverableElements[0]).Perform();

                    linkElements = driver.FindElements(By.XPath("//a[@href]"));
                    if (linkElements != null)
                    {
                        hrefs.AddRange(linkElements.Select(o => o.GetAttribute("href")));
                    }
                }
            }

            var linksToSave = new List <CrawledLink>();
            // Tracks the URIs already saved so duplicate detection does not rescan linksToSave.
            var seenUris    = new HashSet <string>();

            foreach (var href in hrefs)
            {
                var decoded = HttpUtility.HtmlDecode(href);

                // Skip missing/empty hrefs and non-navigable schemes. Schemes are matched
                // ordinally and case-insensitively so that e.g. "Mailto:" is filtered as well.
                if (string.IsNullOrEmpty(decoded) ||
                    decoded.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase) ||
                    decoded.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase) ||
                    decoded.StartsWith("skype:", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                string absoluteUri;

                try
                {
                    absoluteUri = Util.GetUriObjectFromUriString(decoded, plan.AbsoluteUri).AbsoluteUri;
                }
                catch (UriFormatException e) //for mal-formatted uris, just keep the raw value without using System.Uri
                {
                    Console.WriteLine(e);
                    absoluteUri = decoded;
                }

                //no duplicated links
                if (seenUris.Add(absoluteUri))
                {
                    linksToSave.Add(new CrawledLink()
                    {
                        AbsoluteUri       = absoluteUri,
                        IsBrowserRequired = true,
                    });
                }
            }

            crawlResult.LinkAbsoluteUris = linksToSave;

            return(crawlResult);
        }
Пример #2
0
        /// <summary>
        /// Downloads the planned URI with <c>HtmlWeb</c>, records status and timing, and extracts
        /// the page text, de-duplicated anchor links and the canonical link.
        /// </summary>
        /// <param name="plan">
        /// Crawl plan; its <c>AbsoluteUri</c> is the page to load and is also passed to
        /// <c>Util.GetUriObjectFromUriString</c> together with each discovered href or redirect location.
        /// </param>
        /// <returns>
        /// The crawl result:
        /// if loading throws, only <c>FailedAt</c> and <c>FailException</c> are set;
        /// if the status is not 200 OK, the status/timing fields are set and, for 3xx responses,
        /// <c>LocationAbsoluteUri</c> when the redirect location can be resolved;
        /// if the response has no parsed text, only the status/timing fields are set;
        /// otherwise <c>Content</c>, <c>ContentLength</c>, <c>LinkAbsoluteUris</c> (when anchors exist)
        /// and <c>Canonical</c> (when a canonical link with an href exists) are set as well.
        /// </returns>
        public CrawlResult CrawlPage(CrawlPlan plan)
        {
            var crawlResult = new CrawlResult(plan.AbsoluteUri);

            var web = new HtmlWeb();

            // NOTE(review): CaptureRedirect presumably makes HtmlWeb report 3xx responses instead of
            // following them, which the redirect branch below relies on - confirm against HtmlWeb docs.
            web.CaptureRedirect = true;
            web.PreRequest      = request => WebPreRequest(request);
            web.PostResponse    = (request, response) => HtmlWeb_PostResponse(request, response);

            var stopWatch = Stopwatch.StartNew();

            HtmlDocument doc;

            try
            {
                doc = web.Load(plan.AbsoluteUri);
            }
            catch (Exception e)
            {
                // Best effort: a load failure is recorded on the result instead of being rethrown.
                Console.WriteLine(e);

                crawlResult.FailedAt      = DateTime.UtcNow;
                crawlResult.FailException = e.ToString();

                return(crawlResult);
            }

            stopWatch.Stop();

            var statusCode = (int)web.StatusCode;

            crawlResult.CrawledAt     = DateTime.UtcNow;
            crawlResult.StatusCodeStr = web.StatusCode.ToString();
            crawlResult.StatusCode    = statusCode;
            crawlResult.TimeTaken     = (decimal?)stopWatch.Elapsed.TotalSeconds;

            //not 200 OK
            if (web.StatusCode != HttpStatusCode.OK)
            {
                //3xx redirect
                if (statusCode / 100 == 3)
                {
                    try
                    {
                        // _redirectLocation is a field not assigned in this method; presumably it is
                        // filled by the PostResponse hook - verify in HtmlWeb_PostResponse.
                        var locationUri = Util.GetUriObjectFromUriString(_redirectLocation, plan.AbsoluteUri);

                        crawlResult.LocationAbsoluteUri = locationUri.AbsoluteUri;
                    }
                    catch (Exception e)
                    {
                        // An unresolvable redirect target is logged; the status fields are still returned.
                        Console.WriteLine(e);
                    }
                }

                return(crawlResult);
            }

            //not a document
            if (doc.ParsedText == null)
            {
                return(crawlResult);
            }

            crawlResult.ContentLength = doc.ParsedText.Length;
            crawlResult.Content       = doc.ParsedText;

            //find links
            var links = doc.DocumentNode.SelectNodes("//a[@href]");

            if (links != null)
            {
                var linksToSave = new List <CrawledLink>();
                // Tracks the URIs already saved so duplicate detection does not rescan linksToSave.
                var seenUris    = new HashSet <string>();

                foreach (var link in links)
                {
                    // The XPath above only selects anchors that carry an href attribute.
                    var href    = link.Attributes["href"].Value;
                    var decoded = HttpUtility.HtmlDecode(href);

                    // Skip missing/empty hrefs and non-navigable schemes. Schemes are matched
                    // ordinally and case-insensitively so that e.g. "Mailto:" is filtered as well.
                    if (string.IsNullOrEmpty(decoded) ||
                        decoded.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase) ||
                        decoded.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase) ||
                        decoded.StartsWith("skype:", StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    string absoluteUri;

                    try
                    {
                        absoluteUri = Util.GetUriObjectFromUriString(decoded, plan.AbsoluteUri).AbsoluteUri;
                    }
                    catch (UriFormatException e) //for mal-formatted uris, just keep the raw value without using System.Uri
                    {
                        Console.WriteLine(e);
                        absoluteUri = decoded;
                    }

                    //no duplicated links
                    if (seenUris.Add(absoluteUri))
                    {
                        linksToSave.Add(new CrawledLink()
                        {
                            AbsoluteUri = absoluteUri,
                        });
                    }
                }

                crawlResult.LinkAbsoluteUris = linksToSave;
            }

            //find canonical
            var canonicalLinks = doc.DocumentNode.SelectNodes("//link[@rel='canonical']");

            if (canonicalLinks != null && canonicalLinks.Count > 0)
            {
                // A canonical <link> may lack an href; the attribute lookup is then null, so guard
                // it instead of dereferencing .Value directly.
                var canonicalHref = canonicalLinks[0].Attributes["href"];
                if (canonicalHref != null)
                {
                    crawlResult.Canonical = canonicalHref.Value;
                }
            }

            return(crawlResult);
        }