public void Should_crawl_website()
{
    // Wire up the crawl pipeline: downloader -> parser -> single-page crawler.
    var configuration = new Configuration();
    var pageDownloader = new PageDownloader(configuration);
    var htmlParser = new HtmlParser();
    var pageCrawler = new SinglePageCrawler(htmlParser, pageDownloader);

    // Persist crawl results to a local RavenDB instance.
    var documentStore = new DocumentStoreInitializer("http://localhost:8080", "NetCrawler").DocumentStore;
    var persister = new RavenDbCrawlPersister(documentStore);

    // Deduplicate URLs via an in-memory visited set keyed by URL hash.
    var urlHasher = new UrlHasher();
    var crawlUrlRepository = new InMemoryCrawlUrlRepository();

    var websiteCrawler = new WebsiteCrawler(new CrawlScheduler(urlHasher, configuration, pageCrawler, crawlUrlRepository), persister);

    var task = websiteCrawler.RunAsync(new Website
    {
        RootUrl = "http://www.karenmillen.com/",
        MaxConcurrentConnections = 100
    });

    // Block until the crawl finishes. Pass a TimeSpan to Wait() to cap long
    // runs, e.g. task.Wait(new TimeSpan(0, 10, 0)).
    task.Wait();

    task.Status.ShouldBeEquivalentTo(TaskStatus.RanToCompletion);

    var result = task.Result;
    Console.WriteLine("Crawl completed: {0} urls crawled in {1}",
        result.NumberOfPagesCrawled,
        (result.CrawlEnded - result.CrawlStarted).ToString());
}
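The test above hands the scheduler a UrlHasher so visited URLs are stored as fixed-size keys rather than full strings. A minimal sketch of how such a hasher could work, assuming normalization via Uri.GetLeftPart and an MD5 digest; the method name HashUrl and the exact normalization rules are assumptions, not the project's actual contract:

using System;
using System.Security.Cryptography;
using System.Text;

public class UrlHasher
{
    public string HashUrl(Uri url)
    {
        // Normalize: Uri lower-cases scheme/host, and GetLeftPart(Query)
        // drops the fragment so "#section" variants hash identically.
        var normalized = url.GetLeftPart(UriPartial.Query);

        using (var md5 = MD5.Create())
        {
            var bytes = md5.ComputeHash(Encoding.UTF8.GetBytes(normalized));
            return BitConverter.ToString(bytes).Replace("-", string.Empty);
        }
    }
}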
public void Should_extract_links_from_page()
{
    var configuration = new Configuration();
    var pageDownloader = new PageDownloader(configuration);
    var htmlParser = new HtmlParser();
    var crawler = new SinglePageCrawler(htmlParser, pageDownloader);

    // Crawl a single page; the result should carry the links extracted from it.
    // Note: there is no assertion yet, so this test only verifies the crawl
    // completes without throwing.
    var result = crawler.Crawl(new Uri("http://vladpetroff.com"));
}
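Link extraction itself lives in HtmlParser. A sketch of how it might be done with HtmlAgilityPack: select every anchor with an href, resolve relative paths against the page's own URL, and keep only http/https results. The method name ExtractLinks and return type are assumptions; the project's parser may be built differently:

using System;
using System.Collections.Generic;
using HtmlAgilityPack;

public class HtmlParser
{
    public IEnumerable<Uri> ExtractLinks(string html, Uri pageUrl)
    {
        var document = new HtmlDocument();
        document.LoadHtml(html);

        var links = new List<Uri>();
        var anchors = document.DocumentNode.SelectNodes("//a[@href]");
        if (anchors == null)
            return links; // SelectNodes returns null when nothing matches

        foreach (var anchor in anchors)
        {
            var href = anchor.GetAttributeValue("href", string.Empty);
            Uri absolute;
            // Resolve relative hrefs against the page URL; skip mailto:, javascript:, etc.
            if (Uri.TryCreate(pageUrl, href, out absolute)
                && (absolute.Scheme == Uri.UriSchemeHttp || absolute.Scheme == Uri.UriSchemeHttps))
            {
                links.Add(absolute);
            }
        }
        return links;
    }
}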
static void Main()
{
    // Configure log4net from the application config file.
    XmlConfigurator.Configure();
    var log = LogManager.GetLogger(typeof (Program));

    var configuration = new Configuration();
    var pageDownloader = new PageDownloader(configuration);
    var htmlParser = new HtmlParser();
    var pageCrawler = new SinglePageCrawler(htmlParser, pageDownloader);
    var urlHasher = new UrlHasher();

    // Persist crawl results to a local RavenDB instance.
    var documentStore = new DocumentStoreInitializer("http://localhost:8080", "NetCrawler2").DocumentStore;
    // var documentStore = new DocumentStoreInitializer("http://SLB-4B6WZN1:8080", "NetCrawler2").DocumentStore;
    var persister = new RavenDbCrawlPersister(documentStore);

    // Track visited URLs in Redis rather than in memory.
    // var crawlUrlRepository = new InMemoryCrawlUrlRepository();
    var crawlUrlRepository = new RedisCrawlUrlRepository();

    var websiteCrawler = new WebsiteCrawler(new CrawlScheduler(urlHasher, configuration, pageCrawler, crawlUrlRepository), persister);

    // Crawl several sites concurrently, each capped at 25 connections.
    var task = websiteCrawler.RunAsync(new []
    {
        new Website { RootUrl = "http://www.karenmillen.com/", MaxConcurrentConnections = 25 },
        new Website { RootUrl = "http://uk.tommy.com/", MaxConcurrentConnections = 25 },
        new Website { RootUrl = "http://www.houseoffraser.co.uk/", MaxConcurrentConnections = 25 },
        new Website { RootUrl = "http://vladpetroff.com/", MaxConcurrentConnections = 25 },
    });

    // Block on Result, then report totals across all sites: pages crawled and
    // wall-clock time from the earliest start to the latest finish.
    var result = task.Result;
    log.InfoFormat("Crawl completed: {0} urls crawled in {1}",
        result.Sum(x => x.NumberOfPagesCrawled),
        (result.Max(x => x.CrawlEnded) - result.Min(x => x.CrawlStarted)).ToString());
}
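Main swaps the in-memory repository for RedisCrawlUrlRepository, which lets the visited set outlive the process and be shared across crawler instances. A sketch of one way to back it, assuming the StackExchange.Redis client; the TryAdd name, the "crawl:visited" key, and the constructor signature are all illustrative assumptions:

using StackExchange.Redis;

public class RedisCrawlUrlRepository
{
    private readonly IDatabase database;

    public RedisCrawlUrlRepository(string connectionString = "localhost")
    {
        database = ConnectionMultiplexer.Connect(connectionString).GetDatabase();
    }

    // SADD is atomic and returns false when the member already exists,
    // which is exactly the "have we seen this URL?" check the scheduler needs.
    public bool TryAdd(string urlHash)
    {
        return database.SetAdd("crawl:visited", urlHash);
    }
}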