Пример #1
0
        public void Should_crawl_website()
        {
            var configuration  = new Configuration();
            var pageDownloader = new PageDownloader(configuration);
            var htmlParser     = new HtmlParser();
            var pageCrawler    = new SinglePageCrawler(htmlParser, pageDownloader);

            var documentStore = new DocumentStoreInitializer("http://localhost:8080", "NetCrawler").DocumentStore;
            var persister     = new RavenDbCrawlPersister(documentStore);

            var urlHasher          = new UrlHasher();
            var crawlUrlRepository = new InMemoryCrawlUrlRepository();
            var websiteCrawler     = new WebsiteCrawler(new CrawlScheduler(urlHasher, configuration, pageCrawler, crawlUrlRepository), persister);

            var task = websiteCrawler.RunAsync(new Website
            {
                RootUrl = "http://www.karenmillen.com/",
                MaxConcurrentConnections = 100
            });

//			task.Wait(new TimeSpan(0, 10, 0));
//			task.Wait(new TimeSpan(0, 2, 0));
            task.Wait();

            task.Status.ShouldBeEquivalentTo(TaskStatus.RanToCompletion);

            var result = task.Result;

            Console.WriteLine("Crawl completed: {0} urls crawled in {1}", result.NumberOfPagesCrawled, (result.CrawlEnded - result.CrawlStarted).ToString());
        }
Пример #2
0
        static void Main()
        {
            XmlConfigurator.Configure();

            var log = LogManager.GetLogger(typeof(Program));

            var configuration  = new Configuration();
            var pageDownloader = new PageDownloader(configuration);
            var htmlParser     = new HtmlParser();
            var pageCrawler    = new SinglePageCrawler(htmlParser, pageDownloader);

            var urlHasher = new UrlHasher();

            var documentStore = new DocumentStoreInitializer("http://localhost:8080", "NetCrawler2").DocumentStore;
//			var documentStore = new DocumentStoreInitializer("http://SLB-4B6WZN1:8080", "NetCrawler2").DocumentStore;
            var persister = new RavenDbCrawlPersister(documentStore);

//			var crawlUrlRepository = new InMemoryCrawlUrlRepository();
            var crawlUrlRepository = new RedisCrawlUrlRepository();

            var websiteCrawler = new WebsiteCrawler(new CrawlScheduler(urlHasher, configuration, pageCrawler, crawlUrlRepository), persister);

            var task = websiteCrawler.RunAsync(new [] {
                new Website
                {
                    RootUrl = "http://www.karenmillen.com/",
                    MaxConcurrentConnections = 25
                },
                new Website
                {
                    RootUrl = "http://uk.tommy.com/",
                    MaxConcurrentConnections = 25
                },
                new Website
                {
                    RootUrl = "http://www.houseoffraser.co.uk/",
                    MaxConcurrentConnections = 25
                },
                new Website
                {
                    RootUrl = "http://vladpetroff.com/",
                    MaxConcurrentConnections = 25
                },
            });

            var result = task.Result;

            log.InfoFormat("Crawl completed: {0} urls crawled in {1}", result.Sum(x => x.NumberOfPagesCrawled), (result.Max(x => x.CrawlEnded) - result.Min(x => x.CrawlStarted)).ToString());
        }