        public void Should_crawl_website()
        {
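            // Wire up the crawl pipeline: a page downloader and an HTML parser
            // feed a single-page crawler that fetches and parses one URL at a time.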
            var configuration = new Configuration();
            var pageDownloader = new PageDownloader(configuration);
            var htmlParser = new HtmlParser();
            var pageCrawler = new SinglePageCrawler(htmlParser, pageDownloader);

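            // Persist crawl results to RavenDB; requires a server listening on http://localhost:8080.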
            var documentStore = new DocumentStoreInitializer("http://localhost:8080", "NetCrawler").DocumentStore;
            var persister = new RavenDbCrawlPersister(documentStore);

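            // The URL hasher and in-memory repository are presumably used by the
            // scheduler to de-duplicate URLs; state lasts only for this run.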
            var urlHasher = new UrlHasher();
            var crawlUrlRepository = new InMemoryCrawlUrlRepository();
            var websiteCrawler = new WebsiteCrawler(new CrawlScheduler(urlHasher, configuration, pageCrawler, crawlUrlRepository), persister);

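            // Start the crawl from the root URL, allowing up to 100 concurrent connections.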
            var task = websiteCrawler.RunAsync(new Website
                {
                    RootUrl = "http://www.karenmillen.com/",
                    MaxConcurrentConnections = 100
                });

            // Block until the crawl finishes. A bounded wait such as
            // task.Wait(TimeSpan.FromMinutes(10)) would keep the test from hanging if the crawl stalls.
            task.Wait();

            task.Status.Should().Be(TaskStatus.RanToCompletion);

            var result = task.Result;

            Console.WriteLine("Crawl completed: {0} URLs crawled in {1}", result.NumberOfPagesCrawled, result.CrawlEnded - result.CrawlStarted);
        }

        public void Should_extract_links_from_page()
        {
            var configuration = new Configuration();
            var pageDownloader = new PageDownloader(configuration);
            var htmlParser = new HtmlParser();

            var crawler = new SinglePageCrawler(htmlParser, pageDownloader);

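            // Crawl a single page; the result is expected to carry the links extracted from it.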
            var result = crawler.Crawl(new Uri("http://vladpetroff.com"));
        }
Example 3
        static void Main()
        {
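            // Initialise log4net from the XML configuration (the app config file by default).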
            XmlConfigurator.Configure();

            var log = LogManager.GetLogger(typeof(Program));

            var configuration = new Configuration();
            var pageDownloader = new PageDownloader(configuration);
            var htmlParser = new HtmlParser();
            var pageCrawler = new SinglePageCrawler(htmlParser, pageDownloader);

            var urlHasher = new UrlHasher();

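            // Persist crawl results to the "NetCrawler2" RavenDB database; the commented
            // line below shows an alternative server address.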
            var documentStore = new DocumentStoreInitializer("http://localhost:8080", "NetCrawler2").DocumentStore;
            // var documentStore = new DocumentStoreInitializer("http://SLB-4B6WZN1:8080", "NetCrawler2").DocumentStore;
            var persister = new RavenDbCrawlPersister(documentStore);

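            // A Redis-backed repository (in place of the in-memory one) lets the set of
            // visited URLs be shared and outlive the process; assumes a reachable Redis instance.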
            // var crawlUrlRepository = new InMemoryCrawlUrlRepository();
            var crawlUrlRepository = new RedisCrawlUrlRepository();

            var websiteCrawler = new WebsiteCrawler(new CrawlScheduler(urlHasher, configuration, pageCrawler, crawlUrlRepository), persister);

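            // Crawl four sites in a single run, each capped at 25 concurrent connections.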
            var task = websiteCrawler.RunAsync(new[] {
                new Website
                {
                    RootUrl = "http://www.karenmillen.com/",
                    MaxConcurrentConnections = 25
                },
                new Website
                {
                    RootUrl = "http://uk.tommy.com/",
                    MaxConcurrentConnections = 25
                },
                new Website
                {
                    RootUrl = "http://www.houseoffraser.co.uk/",
                    MaxConcurrentConnections = 25
                },
                new Website
                {
                    RootUrl = "http://vladpetroff.com/",
                    MaxConcurrentConnections = 25
                },
            });

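            // Accessing Result blocks until every site crawl has completed.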
            var result = task.Result;

            log.InfoFormat("Crawl completed: {0} URLs crawled in {1}", result.Sum(x => x.NumberOfPagesCrawled), result.Max(x => x.CrawlEnded) - result.Min(x => x.CrawlStarted));
        }