public void Should_crawl_website()
{
    var configuration = new Configuration();
    var pageDownloader = new PageDownloader(configuration);
    var htmlParser = new HtmlParser();
    var pageCrawler = new SinglePageCrawler(htmlParser, pageDownloader);
    var documentStore = new DocumentStoreInitializer("http://localhost:8080", "NetCrawler").DocumentStore;
    var persister = new RavenDbCrawlPersister(documentStore);
    var urlHasher = new UrlHasher();
    var crawlUrlRepository = new InMemoryCrawlUrlRepository();
    var websiteCrawler = new WebsiteCrawler(new CrawlScheduler(urlHasher, configuration, pageCrawler, crawlUrlRepository), persister);

    var task = websiteCrawler.RunAsync(new Website
    {
        RootUrl = "http://www.karenmillen.com/",
        MaxConcurrentConnections = 100
    });

    // task.Wait(new TimeSpan(0, 10, 0));
    // task.Wait(new TimeSpan(0, 2, 0));
    task.Wait();

    task.Status.ShouldBeEquivalentTo(TaskStatus.RanToCompletion);

    var result = task.Result;
    Console.WriteLine("Crawl completed: {0} urls crawled in {1}",
        result.NumberOfPagesCrawled,
        (result.CrawlEnded - result.CrawlStarted).ToString());
}
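// Hypothetical sketch of the Website settings object constructed above. Only
// the two properties these snippets actually set are shown; the real NetCrawler
// class may define more members, so treat this as an assumption, not the source.
public class Website
{
    public string RootUrl { get; set; }
    public int MaxConcurrentConnections { get; set; }
}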
public void Crawl_WithTwoHtmlPages_ShouldReturnAllLinks()
{
    // Arrange
    var parserResult1 = new List<Uri>
    {
        new Uri("https://www.example.com/example1/"),
        new Uri("https://www.example.com/example2/")
    };
    var parserResult2 = new List<Uri>
    {
        new Uri("https://www.example.com/example3/")
    };
    pageParserMock.SetupSequence(p => p.GetLinks(It.IsAny<string>(), It.IsAny<Uri>()))
        .Returns(parserResult1)
        .Returns(parserResult1)
        .Returns(parserResult2)
        .Returns(parserResult2);
    var websiteCrawler = new WebsiteCrawler();

    // Act
    var result = websiteCrawler.Crawl(new Uri("https://www.example.com/"), pageDownloaderMock.Object, pageParserMock.Object);

    // Assert
    Assert.Equal(4, result.Count);
    Assert.Contains(new Uri("https://www.example.com/"), result);
    Assert.Contains(new Uri("https://www.example.com/example1/"), result);
    Assert.Contains(new Uri("https://www.example.com/example2/"), result);
    Assert.Contains(new Uri("https://www.example.com/example3/"), result);
}
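using System;
using System.Collections.Generic;
using Moq;

// The mock-based tests in this listing reference pageParserMock and
// pageDownloaderMock without declaring them. This is a plausible fixture,
// assuming Moq; the interface names IPageParser/IPageDownloader and the
// GetLinks signature are inferred from the Setup calls and the arguments
// passed to Crawl, not taken from the original source.
public interface IPageDownloader
{
    string Download(Uri url);   // assumed member; the tests never stub the downloader
}

public interface IPageParser
{
    IEnumerable<Uri> GetLinks(string html, Uri baseUrl);
}

public class WebsiteCrawlerTests
{
    private readonly Mock<IPageDownloader> pageDownloaderMock = new Mock<IPageDownloader>();
    private readonly Mock<IPageParser> pageParserMock = new Mock<IPageParser>();
}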
static void Main()
{
    XmlConfigurator.Configure();
    var log = LogManager.GetLogger(typeof(Program));

    var configuration = new Configuration();
    var pageDownloader = new PageDownloader(configuration);
    var htmlParser = new HtmlParser();
    var pageCrawler = new SinglePageCrawler(htmlParser, pageDownloader);
    var urlHasher = new UrlHasher();

    var documentStore = new DocumentStoreInitializer("http://localhost:8080", "NetCrawler2").DocumentStore;
    // var documentStore = new DocumentStoreInitializer("http://SLB-4B6WZN1:8080", "NetCrawler2").DocumentStore;
    var persister = new RavenDbCrawlPersister(documentStore);

    // var crawlUrlRepository = new InMemoryCrawlUrlRepository();
    var crawlUrlRepository = new RedisCrawlUrlRepository();

    var websiteCrawler = new WebsiteCrawler(new CrawlScheduler(urlHasher, configuration, pageCrawler, crawlUrlRepository), persister);

    var task = websiteCrawler.RunAsync(new[]
    {
        new Website { RootUrl = "http://www.karenmillen.com/", MaxConcurrentConnections = 25 },
        new Website { RootUrl = "http://uk.tommy.com/", MaxConcurrentConnections = 25 },
        new Website { RootUrl = "http://www.houseoffraser.co.uk/", MaxConcurrentConnections = 25 },
        new Website { RootUrl = "http://vladpetroff.com/", MaxConcurrentConnections = 25 },
    });

    var result = task.Result;
    log.InfoFormat("Crawl completed: {0} urls crawled in {1}",
        result.Sum(x => x.NumberOfPagesCrawled),
        (result.Max(x => x.CrawlEnded) - result.Min(x => x.CrawlStarted)).ToString());
}
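using System;
using System.Collections.Generic;
using System.Threading.Tasks;

// Hypothetical crawler surface inferred from usage only: the first snippet
// awaits a single result exposing NumberOfPagesCrawled/CrawlStarted/CrawlEnded,
// while Main above passes an array and aggregates with Sum/Max/Min, implying a
// collection-returning overload. None of this is confirmed by the source.
public interface IWebsiteCrawler
{
    Task<CrawlResult> RunAsync(Website website);
    Task<IList<CrawlResult>> RunAsync(IEnumerable<Website> websites);
}

public class CrawlResult
{
    public int NumberOfPagesCrawled { get; set; }
    public DateTime CrawlStarted { get; set; }
    public DateTime CrawlEnded { get; set; }
}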
public void Start()
{
    var validator = new InputValidator();
    var websiteCrawler = new WebsiteCrawler();
    var sitemapCrawler = new SitemapCrawler();
    var differencePrinter = new LinksDifferencePrinter();
    var responsePrinter = new ResponsePrinter();
    var performanceEvaluationGetter = new PerformanceEvaluationGetter();

    var inputResult = false;
    Uri websiteUrl = null;

    while (!inputResult)
    {
        Console.WriteLine(@"Enter the website url (e.g. https://www.example.com/):");
        string websiteLink = Console.ReadLine();
        var validationResult = validator.Validate(websiteLink, new UrlValidator(), new RedirectionValidator());
        if (validationResult)
        {
            websiteUrl = new Uri(websiteLink);
            inputResult = true;
        }
    }

    Console.WriteLine("Crawling website. It will take some time...");
    var websiteLinks = websiteCrawler.Crawl(websiteUrl, new PageDownloader(), new PageParser());

    Console.WriteLine("Crawling sitemap. It will take some time...");
    var sitemapLinks = sitemapCrawler.Crawl(websiteUrl, new SitemapLinkReceiver(), new PageDownloader(), new SitemapParser());

    differencePrinter.PrintDifference(sitemapLinks, websiteLinks);

    Console.WriteLine("Response time processing. It will take some time...");
    var combinedLinks = sitemapLinks.Union(websiteLinks).ToList();
    responsePrinter.PrintTable(performanceEvaluationGetter.PrepareLinks(combinedLinks, new PerformanceEvaluator()));

    Console.WriteLine("Press Enter to exit.");
    Console.ReadLine();
}
public void Crawl_WithOneHtmlPage_ShouldReturnTwoLinks()
{
    // Arrange
    pageParserMock.Setup(p => p.GetLinks(It.IsAny<string>(), It.IsAny<Uri>()))
        .Returns(new List<Uri>
        {
            new Uri("https://www.example.com/example1/"),
            new Uri("https://www.example.com/example2/")
        });
    var websiteCrawler = new WebsiteCrawler();

    // Act
    var result = websiteCrawler.Crawl(new Uri("https://www.example.com/"), pageDownloaderMock.Object, pageParserMock.Object);

    // Assert
    Assert.Equal(3, result.Count);
    Assert.Contains(new Uri("https://www.example.com/"), result);
    Assert.Contains(new Uri("https://www.example.com/example1/"), result);
    Assert.Contains(new Uri("https://www.example.com/example2/"), result);
}
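using System;
using System.Collections.Generic;
using System.Linq;

// Minimal sketch of a Crawl implementation that would satisfy both mock-based
// tests above, assuming the IPageDownloader/IPageParser shapes sketched
// earlier: breadth-first traversal with a visited set, so each URL is fetched
// and parsed once and the result contains the root plus every discovered link.
// This is an illustration of the behavior the tests pin down, not the
// project's actual implementation.
public class WebsiteCrawler
{
    public IList<Uri> Crawl(Uri rootUrl, IPageDownloader downloader, IPageParser parser)
    {
        var visited = new HashSet<Uri> { rootUrl };
        var queue = new Queue<Uri>();
        queue.Enqueue(rootUrl);

        while (queue.Count > 0)
        {
            var current = queue.Dequeue();
            var html = downloader.Download(current);   // null from a loose mock is fine here
            foreach (var link in parser.GetLinks(html, current))
            {
                if (visited.Add(link))                 // enqueue only unseen links
                    queue.Enqueue(link);
            }
        }

        return visited.ToList();
    }
}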