/// <summary>
/// Loads the planned page in the browser driver, records its rendered source and
/// collects the href of every anchor found, both right after load and again after
/// hovering over a fixed set of elements.
/// </summary>
/// <param name="plan">
/// Crawl plan; its <c>AbsoluteUri</c> is the address navigated to and is also passed
/// to <c>Util.GetUriObjectFromUriString</c> together with each discovered href.
/// </param>
/// <returns>
/// A <see cref="CrawlResult"/> for <c>plan.AbsoluteUri</c>. If navigation or reading the
/// page source throws, only <c>BrowserFailedAt</c> and <c>BrowserFailedException</c> are
/// filled in. Otherwise <c>BrowserCrawledAt</c>, <c>BrowserContent</c> and
/// <c>LinkAbsoluteUris</c> (de-duplicated, in discovery order) are set.
/// </returns>
public CrawlResult CrawlPage(CrawlPlan plan)
{
    var crawlResult = new CrawlResult(plan.AbsoluteUri);

    string driverPageSource;
    try
    {
        //todo: disable redirect
        driver.Navigate().GoToUrl(plan.AbsoluteUri);

        if (_pageLoadWait > 0)
        {
            // Fixed pause (in seconds) after navigation before reading the page source.
            Thread.Sleep(TimeSpan.FromSeconds(_pageLoadWait));
        }

        driverPageSource = driver.PageSource;
    }
    catch (Exception e)
    {
        Console.WriteLine(e);
        crawlResult.BrowserFailedAt = DateTime.UtcNow;
        crawlResult.BrowserFailedException = e.ToString();
        return crawlResult;
    }

    crawlResult.BrowserCrawledAt = DateTime.UtcNow;
    crawlResult.BrowserContent = driverPageSource;

    // Raw href values in discovery order; duplicates are filtered further below.
    var hrefs = new List<string>();
    hrefs.AddRange(driver.FindElements(By.XPath("//a[@href]")).Select(o => o.GetAttribute("href")));

    //------------------------------------------
    // Some web pages do not show all elements unless interacted with (e.g. hovering),
    // so hover over known elements and look for anchors again.
    //------------------------------------------
    var hoverableList = new string[]
    {
        //"//p[.='Creators']",
        "//p[.='Learn more']",
        "//p[.='Crypto Community']"
    };

    var action = new Actions(driver);
    foreach (var hoverableXPath in hoverableList)
    {
        // FindElement throws NoSuchElementException when nothing matches instead of
        // returning null; FindElements yields an empty collection, so a page without
        // this element is simply skipped.
        var hoverableElement = driver.FindElements(By.XPath(hoverableXPath)).FirstOrDefault();
        if (hoverableElement == null)
        {
            continue;
        }

        action.MoveToElement(hoverableElement).Perform();
        hrefs.AddRange(driver.FindElements(By.XPath("//a[@href]")).Select(o => o.GetAttribute("href")));
    }

    var linksToSave = new List<CrawledLink>();
    // Tracks URIs already added so the duplicate check does not rescan the list.
    var seenUris = new HashSet<string>();

    foreach (var href in hrefs)
    {
        var decoded = HttpUtility.HtmlDecode(href);

        // HtmlDecode(null) is null, so guard against a missing attribute value as well
        // as an empty one; then skip schemes that are not crawlable pages.
        if (string.IsNullOrEmpty(decoded)
            || decoded.StartsWith("javascript:", StringComparison.Ordinal)
            || decoded.StartsWith("mailto:", StringComparison.Ordinal)
            || decoded.StartsWith("skype:", StringComparison.Ordinal))
        {
            continue;
        }

        try
        {
            var childUri = Util.GetUriObjectFromUriString(decoded, plan.AbsoluteUri);

            //no duplicated links
            if (seenUris.Add(childUri.AbsoluteUri))
            {
                linksToSave.Add(new CrawledLink()
                {
                    AbsoluteUri = childUri.AbsoluteUri,
                    IsBrowserRequired = true,
                });
            }
        }
        catch (UriFormatException e)
        {
            // For malformed URIs, keep the decoded text as-is without going through System.Uri.
            if (seenUris.Add(decoded))
            {
                linksToSave.Add(new CrawledLink()
                {
                    AbsoluteUri = decoded,
                    IsBrowserRequired = true,
                });
            }

            Console.WriteLine(e);
        }
    }

    crawlResult.LinkAbsoluteUris = linksToSave;
    return crawlResult;
}
/// <summary>
/// Fetches the planned page with <c>HtmlWeb</c>, records status and timing, and, for a
/// 200 OK document, extracts its text, anchor links and canonical link.
/// </summary>
/// <param name="plan">
/// Crawl plan; its <c>AbsoluteUri</c> is the address loaded and is also passed to
/// <c>Util.GetUriObjectFromUriString</c> together with each discovered href or redirect location.
/// </param>
/// <returns>
/// A <see cref="CrawlResult"/> for <c>plan.AbsoluteUri</c>:
/// if loading throws, only <c>FailedAt</c> and <c>FailException</c> are set;
/// otherwise <c>CrawledAt</c>, <c>StatusCodeStr</c>, <c>StatusCode</c> and <c>TimeTaken</c> are set.
/// For a non-OK status the method returns early, additionally setting
/// <c>LocationAbsoluteUri</c> on a 3xx status when the redirect location can be resolved.
/// For an OK response with parsed text, <c>ContentLength</c>, <c>Content</c>,
/// <c>LinkAbsoluteUris</c> (only when anchors exist) and <c>Canonical</c> (only when a
/// canonical link with an href exists) are set.
/// </returns>
public CrawlResult CrawlPage(CrawlPlan plan)
{
    var crawlResult = new CrawlResult(plan.AbsoluteUri);

    var web = new HtmlWeb();
    // NOTE(review): presumably makes HtmlWeb surface 3xx responses instead of following them - confirm.
    web.CaptureRedirect = true;
    web.PreRequest = request => WebPreRequest(request);
    web.PostResponse = (request, response) => HtmlWeb_PostResponse(request, response);

    var stopWatch = Stopwatch.StartNew();

    HtmlDocument doc;
    try
    {
        doc = web.Load(plan.AbsoluteUri);
    }
    catch (Exception e)
    {
        Console.WriteLine(e);
        crawlResult.FailedAt = DateTime.UtcNow;
        crawlResult.FailException = e.ToString();
        return crawlResult;
    }

    stopWatch.Stop();

    var statusCode = (int)web.StatusCode;

    crawlResult.CrawledAt = DateTime.UtcNow;
    crawlResult.StatusCodeStr = web.StatusCode.ToString();
    crawlResult.StatusCode = statusCode;
    crawlResult.TimeTaken = (decimal?)stopWatch.Elapsed.TotalSeconds;

    //not 200 OK
    if (web.StatusCode != HttpStatusCode.OK)
    {
        //3xx redirect
        if (statusCode / 100 == 3)
        {
            try
            {
                // NOTE(review): _redirectLocation is presumably filled in by HtmlWeb_PostResponse - confirm.
                var locationUri = Util.GetUriObjectFromUriString(_redirectLocation, plan.AbsoluteUri);
                crawlResult.LocationAbsoluteUri = locationUri.AbsoluteUri;
            }
            catch (Exception e)
            {
                // Best effort: a redirect target that cannot be resolved is logged and left unset.
                Console.WriteLine(e);
            }
        }

        return crawlResult;
    }

    //not a document
    if (doc.ParsedText == null)
    {
        return crawlResult;
    }

    crawlResult.ContentLength = doc.ParsedText.Length;
    crawlResult.Content = doc.ParsedText;

    //find links
    var links = doc.DocumentNode.SelectNodes("//a[@href]");
    if (links != null)
    {
        var linksToSave = new List<CrawledLink>();
        // Tracks URIs already added so the duplicate check does not rescan the list.
        var seenUris = new HashSet<string>();

        foreach (var link in links)
        {
            var decoded = HttpUtility.HtmlDecode(link.Attributes["href"].Value);

            // Skip empty hrefs and schemes that are not crawlable pages.
            if (string.IsNullOrEmpty(decoded)
                || decoded.StartsWith("javascript:", StringComparison.Ordinal)
                || decoded.StartsWith("mailto:", StringComparison.Ordinal)
                || decoded.StartsWith("skype:", StringComparison.Ordinal))
            {
                continue;
            }

            try
            {
                var childUri = Util.GetUriObjectFromUriString(decoded, plan.AbsoluteUri);

                //no duplicated links
                if (seenUris.Add(childUri.AbsoluteUri))
                {
                    linksToSave.Add(new CrawledLink()
                    {
                        AbsoluteUri = childUri.AbsoluteUri,
                    });
                }
            }
            catch (UriFormatException e)
            {
                // For malformed URIs, keep the decoded text as-is without going through System.Uri.
                if (seenUris.Add(decoded))
                {
                    linksToSave.Add(new CrawledLink()
                    {
                        AbsoluteUri = decoded,
                    });
                }

                Console.WriteLine(e);
            }
        }

        crawlResult.LinkAbsoluteUris = linksToSave;
    }

    //find canonical
    // Require @href in the query: a <link rel='canonical'> without an href would otherwise
    // make Attributes["href"] null and the .Value access throw.
    var canonicalLinks = doc.DocumentNode.SelectNodes("//link[@rel='canonical' and @href]");
    if (canonicalLinks != null && canonicalLinks.Count > 0)
    {
        crawlResult.Canonical = canonicalLinks[0].Attributes["href"].Value;
    }

    return crawlResult;
}