/// <summary>
/// Verifies that the bot follows a two-link chain (root -> /csse.html -> /abbe.html)
/// and returns one CrawlResult per visited page.
/// </summary>
public void TestChainTwoLinks() {
    var mocks = new MockRepository();
    var web = mocks.StrictMock<IWebInteractor>();
    var fileSystemStub = MockRepository.GenerateStub<IFileSystemInteractor>();

    // Root page links to /csse.html, which links to /abbe.html, which links nowhere.
    var rootPage = new CrawlResult { ReturnCode = 200, ReturnStatus = "OK", Html = "href=\"/csse.html\"" };
    var cssePage = new CrawlResult { ReturnCode = 200, ReturnStatus = "OK", Html = "href=\"/abbe.html\"" };
    var abbePage = new CrawlResult { ReturnCode = 200, ReturnStatus = "OK", Html = "" };

    Expect.On(web).Call(web.GetPage("www.test.com")).Return(rootPage);
    Expect.On(web).Call(web.GetPage("www.test.com/csse.html")).Return(cssePage);
    Expect.On(web).Call(web.GetPage("www.test.com/abbe.html")).Return(abbePage);
    mocks.ReplayAll();

    var bot = new Bot(new Website("www.test.com", "simplepath"), null, null, web, fileSystemStub);
    List<CrawlResult> results = bot.CrawlSite(2);

    mocks.VerifyAll();
    Assert.AreEqual(3, results.Count);
}
/// <summary>
/// Entry point: runs a single Abot crawl against a fixed URL and waits for a key press.
/// </summary>
static void Main(string[] args) {
    // Wire up all crawl lifecycle events before starting the crawl.
    var crawler = new PoliteWebCrawler();
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    // Crawl() is synchronous: it does not return until the whole crawl has completed.
    CrawlResult result = crawler.Crawl(new Uri("https://www.goodreads.com/quotes/tag/motivation?page=2"));

    //if (result.ErrorOccurred)
    //    Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    //else
    //    Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);

    // Keep the console window open until the user presses Enter.
    Console.ReadLine();
}
/// <summary>
/// Runs an Abot crawl with a custom configuration against the URL in textBox1
/// and logs success or failure to the console.
/// </summary>
void testWebAbot() {
    var crawlConfig = new CrawlConfiguration
    {
        CrawlTimeoutSeconds = 100,
        MaxConcurrentThreads = 50,
        MaxPagesToCrawl = 3000,
        UserAgentString = "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko",
        IsSendingCookiesEnabled = false,
        HttpServicePointConnectionLimit = 200,
        MaxCrawlDepth = 10000,
        IsExternalPageCrawlingEnabled = true,
        MaxPagesToCrawlPerDomain = 10000,
        // BUG FIX: MaxLinksPerPage was assigned twice (3000, then 0); only the
        // later value (0) ever took effect, so the dead assignment was removed.
        // NOTE(review): presumably 0 means "no per-page link limit" in Abot — confirm intended.
        MaxLinksPerPage = 0,
        IsHttpRequestAutoRedirectsEnabled = true,
        MaxMemoryUsageInMb = 0
    };
    crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue1", "1111");
    crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue2", "2222");

    var crawler = new PoliteWebCrawler(crawlConfig);
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    // Crawl() is synchronous: it does not return until the whole crawl has completed.
    CrawlResult result = crawler.Crawl(new Uri(textBox1.Text));

    if (result.ErrorOccurred)
    {
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    }
    else
    {
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
    }
}
/// <summary>
/// Fetches the given url (prefixed with http://) via a plain GET, without
/// following redirects, and packages the status code, status text, and body
/// into a CrawlResult. On failure, the error status (when available) is
/// recorded and the error is logged; the method never throws.
/// </summary>
/// <param name="url">Host plus optional path, without a scheme, e.g. "www.test.com/csse.html".</param>
/// <returns>A CrawlResult; ReturnCode is 0 only when no HTTP response was received at all.</returns>
public CrawlResult GetPage(string url)
{
    var cr = new CrawlResult();

    var request = (HttpWebRequest)WebRequest.Create("http://" + url);
    request.Method = "GET";
    // Redirects are surfaced to the caller (3xx status) rather than followed.
    request.AllowAutoRedirect = false;

    try
    {
        // FIX: response and reader were previously never disposed (leaked connections).
        using (var response = (HttpWebResponse)request.GetResponse())
        using (var reader = new StreamReader(response.GetResponseStream()))
        {
            cr.Html = reader.ReadToEnd();
            cr.ReturnCode = (int)response.StatusCode;
            cr.ReturnStatus = response.StatusDescription;
        }
    }
    catch (WebException we)
    {
        // FIX: non-success HTTP statuses (404, 500, ...) arrive here as WebException;
        // previously the result was returned with ReturnCode left at 0. Record the
        // real status when a response exists. we.Response is null for DNS/connect
        // failures, which is presumably why the earlier (commented-out) version of
        // this code crashed and was disabled.
        var errorResponse = we.Response as HttpWebResponse;
        if (errorResponse != null)
        {
            using (errorResponse)
            {
                cr.ReturnCode = (int)errorResponse.StatusCode;
                cr.ReturnStatus = errorResponse.StatusCode.ToString();
            }
        }
        _log.writeError("Error getting site " + url);
    }
    return cr;
}
/// <summary>
/// Verifies that when the root page returns a 404, CrawlSite surfaces the
/// 404 result to the caller.
/// </summary>
public void TestStatus404() {
    var mocks = new MockRepository();
    var web = mocks.StrictMock<IWebInteractor>();
    var site = new Website("www.whocares.com", "whatever");
    // CONSISTENCY FIX: stub the IFileSystemInteractor interface rather than the
    // concrete FileSystemInteractor class, matching the other Bot tests
    // (a concrete-class stub only intercepts virtual members under Rhino Mocks).
    var mockFSI = MockRepository.GenerateStub<IFileSystemInteractor>();

    var retVal = new CrawlResult { ReturnCode = 404, Html = @"" };
    Expect.On(web).Call(web.GetPage("www.whocares.com")).Return(retVal);
    mocks.ReplayAll();

    var useableBot = new Bot(site, null, null, web, mockFSI);
    var check = useableBot.CrawlSite(1);

    // CONSISTENCY FIX: verify expectations before asserting, matching the
    // ordering used by the other Bot tests.
    mocks.VerifyAll();
    Assert.IsTrue(check[0].ReturnCode == 404);
}
/// <summary>
/// Verifies that an absolute http://www.test.com/... link on the root page is
/// recognized as internal and crawled, yielding two results.
/// </summary>
public void TestHttpInternalWithWwwLink() {
    var mocks = new MockRepository();
    var web = mocks.StrictMock<IWebInteractor>();
    var site = new Website("www.test.com", "whatever");
    var fileSystemStub = MockRepository.GenerateStub<IFileSystemInteractor>();

    // Root page carries a fully-qualified link back into the same site; the
    // linked page has no further links.
    var rootPage = new CrawlResult { ReturnCode = 200, ReturnStatus = "OK", Html = "href=\"http://www.test.com/csse.html\"" };
    var cssePage = new CrawlResult { ReturnCode = 200, ReturnStatus = "OK", Html = "" };

    Expect.On(web).Call(web.GetPage("www.test.com")).Return(rootPage);
    Expect.On(web).Call(web.GetPage("www.test.com/csse.html")).Return(cssePage);
    mocks.ReplayAll();

    var bot = new Bot(site, null, null, web, fileSystemStub);
    List<CrawlResult> results = bot.CrawlSite(2);

    mocks.VerifyAll();
    Assert.AreEqual(2, results.Count);
}
/// <summary>
/// Loads the planned URI in the shared WebDriver instance, captures the rendered
/// page source, hovers over a hard-coded set of elements to reveal lazily-shown
/// content, and collects the de-duplicated set of anchor hrefs into the result.
/// Navigation failures are recorded on the result rather than thrown.
/// </summary>
/// <param name="plan">The crawl plan carrying the absolute URI to visit.</param>
/// <returns>A CrawlResult with browser content, timestamps, and discovered links.</returns>
public CrawlResult CrawlPage(CrawlPlan plan)
{
    var crawlResult = new CrawlResult(plan.AbsoluteUri);
    string driverPageSource;
    //using (IWebDriver driver = new ChromeDriver(new ChromeOptions() { }))
    //{
    //WebDriverWait wait = new WebDriverWait(driver, TimeSpan.FromSeconds(10));
    try
    {
        //todo: disable redirect
        driver.Navigate().GoToUrl(plan.AbsoluteUri);
        // Optional fixed wait to let client-side rendering settle before reading PageSource.
        if (_pageLoadWait > 0)
        {
            Thread.Sleep(TimeSpan.FromSeconds(_pageLoadWait));
        }
        driverPageSource = driver.PageSource;
    }
    catch (Exception e)
    {
        // Navigation failed: record the failure on the result and bail out early.
        Console.WriteLine(e);
        crawlResult.BrowserFailedAt = DateTime.UtcNow;
        crawlResult.BrowserFailedException = e.ToString();
        //throw;
        return (crawlResult);
    }
    //string html = driver.ExecuteJavaScript<string>("return document.documentElement.outerHTML;");
    //driver.FindElement(By.Name("q")).SendKeys("cheese" + Keys.Enter);
    //IWebElement firstResult = wait.Until(ExpectedConditions.ElementExists(By.TagName("h1")));
    //Console.WriteLine(firstResult.GetAttribute("textContent"));
    //}
    //crawlResult.Doc = doc;
    crawlResult.BrowserCrawledAt = DateTime.UtcNow;
    crawlResult.BrowserContent = driverPageSource;

    // Each element of linkSets is one snapshot of the page's anchor hrefs;
    // additional snapshots are appended after each hover interaction below.
    var linkSets = new List<List<string>>();

    // Snapshot the links visible on initial load.
    var linkElements = driver.FindElements(By.XPath("//a[@href]"));
    if (linkElements != null && linkElements.Count > 0)
    {
        linkSets.Add(linkElements.Select(o => o.GetAttribute("href")).ToList());
    }

    //------------------------------------------
    // Some webpages do not show all elements unless interacted with (e.g. hovering):
    // hover over each hard-coded XPath target and re-snapshot the page's links.
    //------------------------------------------
    var hoverableList = new string[]
    {
        //"//p[.='Creators']",
        "//p[.='Learn more']",
        "//p[.='Crypto Community']"
    };
    if (hoverableList != null && hoverableList.Length > 0)
    {
        var action = new Actions(driver);
        foreach (var hoverableXPath in hoverableList)
        {
            // NOTE(review): FindElement throws when no match exists — presumably these
            // XPaths are expected on every crawled page; confirm against callers.
            var hoverableElement = driver.FindElement(By.XPath(hoverableXPath));
            if (hoverableElement != null)
            {
                action.MoveToElement(hoverableElement).Perform();
                // Re-collect links after the hover may have revealed new content.
                linkElements = driver.FindElements(By.XPath("//a[@href]"));
                if (linkElements != null && linkElements.Count > 0)
                {
                    linkSets.Add(linkElements.Select(o => o.GetAttribute("href")).ToList());
                }
            }
        }
    }

    // Merge all snapshots into one de-duplicated list of crawled links.
    var linksToSave = new List<CrawledLink>();
    foreach (var linkSet in linkSets)
    {
        //Console.WriteLine($"\tfound {links.Count} child links.");
        foreach (var href in linkSet)
        {
            //var href = link.GetAttribute("href");
            //if(href.Contains("fortnite-stats/")) Debugger.Break();
            var decoded = HttpUtility.HtmlDecode(href);
            //if (decoded != href)
            //    Debugger.Break();
            // Skip empty and non-HTTP pseudo-links.
            if (decoded == "" || decoded.StartsWith("javascript:") || decoded.StartsWith("mailto:") || decoded.StartsWith("skype:"))
            {
                continue;
            }
            try
            {
                // Resolve relative hrefs against the page being crawled.
                var childUri = Util.GetUriObjectFromUriString(decoded, plan.AbsoluteUri);
                //no duplicated links
                if (linksToSave.All(o => o.AbsoluteUri != childUri.AbsoluteUri))
                {
                    linksToSave.Add(new CrawledLink()
                    {
                        AbsoluteUri = childUri.AbsoluteUri,
                        IsBrowserRequired = true,
                    });
                }
            }
            catch (UriFormatException e) //for mal-formated uris, just add them into the list without using System.Uri
            {
                if (linksToSave.All(o => o.AbsoluteUri != decoded))
                {
                    linksToSave.Add(new CrawledLink()
                    {
                        AbsoluteUri = decoded,
                        IsBrowserRequired = true,
                    });
                }
                Console.WriteLine(e);
            }
        }
    }
    crawlResult.LinkAbsoluteUris = linksToSave;
    return (crawlResult);
}
/// <summary>
/// Fetches the planned URI with HtmlAgilityPack's HtmlWeb, records status code
/// and timing, captures a 3xx redirect target when present, and on a 200 OK
/// extracts the page content, de-duplicated anchor hrefs, and canonical link.
/// Load failures are recorded on the result rather than thrown.
/// </summary>
/// <param name="plan">The crawl plan carrying the absolute URI to fetch.</param>
/// <returns>A CrawlResult with status, content, links, redirect, and canonical info.</returns>
public CrawlResult CrawlPage(CrawlPlan plan)
{
    var crawlResult = new CrawlResult(plan.AbsoluteUri);
    var web = new HtmlWeb();
    web.CaptureRedirect = true;
    // Request/response hooks; the post-response handler presumably records the
    // redirect Location header into _redirectLocation (read below) — confirm.
    web.PreRequest = request => WebPreRequest(request);
    web.PostResponse = (request, response) => HtmlWeb_PostResponse(request, response);

    var stopWatch = new Stopwatch();
    stopWatch.Start();
    //Console.WriteLine($"loading...\t{plan.AbsoluteUri}");
    HtmlDocument doc;
    try
    {
        doc = web.Load(plan.AbsoluteUri);
    }
    catch (Exception e)
    {
        // Load failed: record the failure on the result and bail out early.
        Console.WriteLine(e);
        //if (e is HtmlWebException && e.Message.StartsWith("Unsupported uri scheme:"))
        //{
        //}
        //else
        //{
        crawlResult.FailedAt = DateTime.UtcNow;
        crawlResult.FailException = e.ToString();
        //}
        return (crawlResult);
    }
    stopWatch.Stop();

    // Record status and timing regardless of outcome.
    var statusCode = (int)web.StatusCode;
    var statusCodeString = web.StatusCode.ToString();
    var timeTaken = stopWatch.Elapsed.TotalSeconds;
    crawlResult.CrawledAt = DateTime.UtcNow;
    crawlResult.StatusCodeStr = statusCodeString;
    crawlResult.StatusCode = statusCode;
    crawlResult.TimeTaken = (decimal?)timeTaken;

    //not 200 OK
    if (web.StatusCode != HttpStatusCode.OK)
    {
        //Console.WriteLine($"\tstatus code = {statusCode}");
        //3xx redirect: resolve the Location header against the current URI.
        if (statusCode / 100 == 3)
        {
            try
            {
                //Console.WriteLine($"\tfound redirect {statusCode} {_redirectLocation}");
                var locationUri = Util.GetUriObjectFromUriString(_redirectLocation, plan.AbsoluteUri);
                //Console.WriteLine($"\tfound new location {locationUri.AbsoluteUri}");
                crawlResult.LocationAbsoluteUri = locationUri.AbsoluteUri;
            }
            catch (Exception e)
            {
                // A malformed redirect target is logged but does not fail the crawl.
                Console.WriteLine(e);
            }
        }
        return (crawlResult);
    }

    //not a document (no parsable text body)
    if (doc.ParsedText == null)
    {
        return (crawlResult);
    }
    //crawlResult.Doc = doc;
    crawlResult.ContentLength = doc.ParsedText.Length;
    crawlResult.Content = doc.ParsedText;

    //find links
    var links = doc.DocumentNode.SelectNodes("//a[@href]");
    if (links != null)
    {
        //Console.WriteLine($"\tfound {links.Count} child links.");
        var linksToSave = new List<CrawledLink>();
        foreach (var link in links)
        {
            var href = link.Attributes["href"].Value;
            //if(href.Contains("fortnite-stats/")) Debugger.Break();
            var decoded = HttpUtility.HtmlDecode(href);
            //if (decoded != href)
            //    Debugger.Break();
            // Skip empty and non-HTTP pseudo-links.
            if (decoded == "" || decoded.StartsWith("javascript:") || decoded.StartsWith("mailto:") || decoded.StartsWith("skype:"))
            {
                continue;
            }
            try
            {
                // Resolve relative hrefs against the page being crawled.
                var childUri = Util.GetUriObjectFromUriString(decoded, plan.AbsoluteUri);
                //no duplicated links
                if (linksToSave.All(o => o.AbsoluteUri != childUri.AbsoluteUri))
                {
                    linksToSave.Add(new CrawledLink()
                    {
                        AbsoluteUri = childUri.AbsoluteUri,
                    });
                }
            }
            catch (UriFormatException e)//for mal-formated uris, just add them into the list without using System.Uri
            {
                if (linksToSave.All(o => o.AbsoluteUri != decoded))
                {
                    linksToSave.Add(new CrawledLink()
                    {
                        AbsoluteUri = decoded,
                    });
                }
                Console.WriteLine(e);
            }
        }
        crawlResult.LinkAbsoluteUris = linksToSave;
    }

    //find canonical: record the first <link rel='canonical'> href, if any.
    var canonicalLinks = doc.DocumentNode.SelectNodes("//link[@rel='canonical']");
    if (canonicalLinks != null && canonicalLinks.Count > 0)
    {
        var canonicalLinkValue = canonicalLinks[0].Attributes["href"].Value;
        //Console.WriteLine($"\tfound canonical");
        crawlResult.Canonical = canonicalLinkValue;
    }
    return (crawlResult);
}