/// <summary>
/// Creates the scraper and configures its crawler: a single crawl thread with
/// external page crawling and external page-link crawling enabled, plus two
/// decision callbacks driven by the <c>page</c> query-string parameter.
/// </summary>
public ArtistDataScraper()
{
    var config = new CrawlConfiguration
    {
        MaxConcurrentThreads = 1,
        IsExternalPageCrawlingEnabled = true,
        IsExternalPageLinksCrawlingEnabled = true
    };

    _crawler = new EasyWebCrawler(config);
    _crawedPages = new HashSet<int>();
    _artistPages = new List<ArtistPage>(1000);

    // Link decision: allowed only when the page's query string begins with "?page=".
    // The prefix is a URL token, so it is compared ordinally rather than with the
    // current culture's linguistic rules.
    _crawler.ShouldCrawlPageLinks((page, context) =>
    {
        var allow = page.Uri.Query.StartsWith("?page=", System.StringComparison.Ordinal);
        return new CrawlDecision { Allow = allow };
    });

    // Page decision: allowed only when the "page" parameter parses as an integer
    // that has not been recorded yet. HashSet.Add returns false for a value that is
    // already present, so a repeated page number is rejected. A missing or
    // non-numeric parameter fails TryParse and is rejected without touching the set.
    _crawler.ShouldCrawlPage((page, context) =>
    {
        var parsedQuery = HttpUtility.ParseQueryString(page.Uri.Query);
        var pageNumberString = parsedQuery.Get("page");

        int pageNumber;
        var allow = int.TryParse(pageNumberString, out pageNumber)
                    && _crawedPages.Add(pageNumber);

        return new CrawlDecision { Allow = allow };
    });

    _crawler.PageCrawlCompleted += CrawlerOnPageCrawlCompleted;
}
/// <summary>
/// Creates the scraper and configures its crawler: a single crawl thread, a
/// maximum crawl depth of 1, external page crawling and external page-link
/// crawling enabled, plus two decision callbacks driven by the <c>id</c> and
/// <c>searchartist</c> query-string parameters.
/// </summary>
public SongScraper()
{
    var config = new CrawlConfiguration
    {
        MaxConcurrentThreads = 1,
        MaxCrawlDepth = 1,
        IsExternalPageCrawlingEnabled = true,
        IsExternalPageLinksCrawlingEnabled = true
    };

    _crawler = new EasyWebCrawler(config);
    _crawledIds = new HashSet<string>();
    _songs = new List<Song>();
    _pathCorrector = new PathStringCorrector();

    // Link decision: allowed for every page except those whose query string begins
    // with "?id=". The prefix is a URL token, so it is compared ordinally rather
    // than with the current culture's linguistic rules.
    _crawler.ShouldCrawlPageLinks((page, context) =>
    {
        var allow = !page.Uri.Query.StartsWith("?id=", System.StringComparison.Ordinal);
        return new CrawlDecision { Allow = allow };
    });

    // Page decision:
    //  - when an "id" parameter is present, allow only the first time that id is
    //    seen (HashSet.Add returns false for an id that is already recorded);
    //  - otherwise allow only when "searchartist" equals "1".
    _crawler.ShouldCrawlPage((page, context) =>
    {
        var parsedQuery = HttpUtility.ParseQueryString(page.Uri.Query);
        var id = parsedQuery.Get("id");

        var allow = id != null
            ? _crawledIds.Add(id)
            : parsedQuery.Get("searchartist") == "1";

        return new CrawlDecision { Allow = allow };
    });

    _crawler.PageCrawlCompleted += CrawlerOnPageCrawlCompleted;
}