Code example #1
        public void Should_crawl_website()
        {
            var configuration  = new Configuration();
            var pageDownloader = new PageDownloader(configuration);
            var htmlParser     = new HtmlParser();
            var pageCrawler    = new SinglePageCrawler(htmlParser, pageDownloader);

            var documentStore = new DocumentStoreInitializer("http://localhost:8080", "NetCrawler").DocumentStore;
            var persister     = new RavenDbCrawlPersister(documentStore);

            var urlHasher          = new UrlHasher();
            var crawlUrlRepository = new InMemoryCrawlUrlRepository();
            var websiteCrawler     = new WebsiteCrawler(new CrawlScheduler(urlHasher, configuration, pageCrawler, crawlUrlRepository), persister);

            var task = websiteCrawler.RunAsync(new Website
            {
                RootUrl = "http://www.karenmillen.com/",
                MaxConcurrentConnections = 100
            });

//			task.Wait(new TimeSpan(0, 10, 0));
//			task.Wait(new TimeSpan(0, 2, 0));
            task.Wait();

            task.Status.ShouldBeEquivalentTo(TaskStatus.RanToCompletion);

            var result = task.Result;

            Console.WriteLine("Crawl completed: {0} urls crawled in {1}", result.NumberOfPagesCrawled, (result.CrawlEnded - result.CrawlStarted).ToString());
        }
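The commented-out Wait overloads above show earlier attempts to bound the crawl time. A hedged variant of the same assertion that fails the test instead of blocking forever if the crawl hangs; the ten-minute limit is an assumption, not a value taken from the project:

            // Sketch only: bound the wait; the timeout value is an arbitrary assumption.
            var completedInTime = task.Wait(TimeSpan.FromMinutes(10));
            completedInTime.Should().BeTrue("the crawl is expected to finish within the time limit");
            task.Status.ShouldBeEquivalentTo(TaskStatus.RanToCompletion);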
Code example #2
        public void Crawl_WithTwoHtmlPages_ShouldReturnAllLinks()
        {
            //Arrange
            var parserResult1 = new List<Uri>()
            {
                new Uri("https://www.example.com/example1/"), new Uri("https://www.example.com/example2/")
            };
            var parserResult2 = new List<Uri>()
            {
                new Uri("https://www.example.com/example3/")
            };

            pageParserMock.SetupSequence(p => p.GetLinks(It.IsAny<string>(), It.IsAny<Uri>()))
            .Returns(parserResult1)
            .Returns(parserResult1)
            .Returns(parserResult2)
            .Returns(parserResult2);
            var websiteCrawler = new WebsiteCrawler();

            //Act
            var result = websiteCrawler.Crawl(new Uri("https://www.example.com/"), pageDownloaderMock.Object, pageParserMock.Object);

            //Assert
            Assert.Equal(4, result.Count);
            Assert.Contains(new Uri("https://www.example.com/"), result);
            Assert.Contains(new Uri("https://www.example.com/example1/"), result);
            Assert.Contains(new Uri("https://www.example.com/example2/"), result);
            Assert.Contains(new Uri("https://www.example.com/example3/"), result);
        }
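Examples #2 and #5 reference pageParserMock and pageDownloaderMock fields that are declared outside the snippets. A minimal sketch of the fixture members they appear to assume, using Moq; the IPageParser and IPageDownloader shapes are inferred from the mocked calls and are assumptions, not the project's actual definitions:

        // Hypothetical interfaces inferred from the mock setups above (assumed, not from the source).
        public interface IPageParser
        {
            List<Uri> GetLinks(string html, Uri baseUri);
        }

        public interface IPageDownloader
        {
            string DownloadPage(Uri uri);   // assumed member; the tests never configure the downloader explicitly
        }

        // Fixture fields the tests rely on; with Moq's default loose behaviour the
        // downloader mock simply returns null for any page it is asked to fetch.
        private readonly Mock<IPageParser>     pageParserMock     = new Mock<IPageParser>();
        private readonly Mock<IPageDownloader> pageDownloaderMock = new Mock<IPageDownloader>();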
Code example #3
        static void Main()
        {
            XmlConfigurator.Configure();

            var log = LogManager.GetLogger(typeof(Program));

            var configuration  = new Configuration();
            var pageDownloader = new PageDownloader(configuration);
            var htmlParser     = new HtmlParser();
            var pageCrawler    = new SinglePageCrawler(htmlParser, pageDownloader);

            var urlHasher = new UrlHasher();

            var documentStore = new DocumentStoreInitializer("http://localhost:8080", "NetCrawler2").DocumentStore;
//			var documentStore = new DocumentStoreInitializer("http://SLB-4B6WZN1:8080", "NetCrawler2").DocumentStore;
            var persister = new RavenDbCrawlPersister(documentStore);

//			var crawlUrlRepository = new InMemoryCrawlUrlRepository();
            var crawlUrlRepository = new RedisCrawlUrlRepository();

            var websiteCrawler = new WebsiteCrawler(new CrawlScheduler(urlHasher, configuration, pageCrawler, crawlUrlRepository), persister);

            var task = websiteCrawler.RunAsync(new [] {
                new Website
                {
                    RootUrl = "http://www.karenmillen.com/",
                    MaxConcurrentConnections = 25
                },
                new Website
                {
                    RootUrl = "http://uk.tommy.com/",
                    MaxConcurrentConnections = 25
                },
                new Website
                {
                    RootUrl = "http://www.houseoffraser.co.uk/",
                    MaxConcurrentConnections = 25
                },
                new Website
                {
                    RootUrl = "http://vladpetroff.com/",
                    MaxConcurrentConnections = 25
                },
            });

            var result = task.Result;

            log.InfoFormat("Crawl completed: {0} urls crawled in {1}", result.Sum(x => x.NumberOfPagesCrawled), (result.Max(x => x.CrawlEnded) - result.Min(x => x.CrawlStarted)).ToString());
        }
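Both RunAsync calls (examples #1 and #3) configure each site through a small Website settings object. A sketch of what that type presumably looks like, limited to the two properties the examples actually set; anything beyond these names is an assumption:

    // Sketch of the per-site settings consumed by WebsiteCrawler.RunAsync.
    // Only the properties used in the examples are shown; the real class may expose more.
    public class Website
    {
        public string RootUrl { get; set; }                 // crawl entry point
        public int MaxConcurrentConnections { get; set; }   // cap on parallel requests for this site
    }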
Code example #4
        public void Start()
        {
            var validator      = new InputValidator();
            var websiteCrawler = new WebsiteCrawler();
            var sitemapCrawler = new SitemapCrawler();

            var differencePrinter = new LinksDifferencePrinter();
            var responsePrinter   = new ResponsePrinter();

            var performanceEvaluationGetter = new PerformanceEvaluationGetter();

            bool inputResult = false;
            Uri  websiteUrl  = null;

            while (inputResult == false)
            {
                Console.WriteLine(@"Enter the website url (e.g. https://www.example.com/):");
                string websiteLink      = Console.ReadLine();
                var    validationResult = validator.Validate(websiteLink, new UrlValidator(), new RedirectionValidator());

                if (!validationResult)
                {
                    inputResult = false;
                }
                else
                {
                    websiteUrl  = new Uri(websiteLink);
                    inputResult = true;
                }
            }

            Console.WriteLine("Crawling website. It will take some time...");
            var websiteLinks = websiteCrawler.Crawl(websiteUrl, new PageDownloader(), new PageParser());

            Console.WriteLine("Crawling sitemap. It will take some time...");
            var sitemapLinks = sitemapCrawler.Crawl(websiteUrl, new SitemapLinkReceiver(), new PageDownloader(), new SitemapParser());

            differencePrinter.PrintDifference(sitemapLinks, websiteLinks);

            Console.WriteLine("Response time processing. It will take some time...");
            var combinedLinks = sitemapLinks.Union(websiteLinks).ToList();

            responsePrinter.PrintTable(performanceEvaluationGetter.PrepareLinks(combinedLinks, new PerformanceEvaluator()));

            Console.WriteLine("Enter to exit.");
            Console.ReadLine();
        }
Code example #5
        public void Crawl_WithOneHtmlPage_ShouldReturnTwoLinks()
        {
            //Arrange
            pageParserMock.Setup(p => p.GetLinks(It.IsAny<string>(), It.IsAny<Uri>()))
            .Returns(new List<Uri>()
            {
                new Uri("https://www.example.com/example1/"), new Uri("https://www.example.com/example2/")
            });
            var websiteCrawler = new WebsiteCrawler();

            //Act
            var result = websiteCrawler.Crawl(new Uri("https://www.example.com/"), pageDownloaderMock.Object, pageParserMock.Object);

            //Assert
            Assert.Equal(3, result.Count);
            Assert.Contains(new Uri("https://www.example.com/"), result);
            Assert.Contains(new Uri("https://www.example.com/example1/"), result);
            Assert.Contains(new Uri("https://www.example.com/example2/"), result);
        }