/// <summary>
        /// Builds the crawler used to collect artist listing pages and wires up its
        /// crawl-decision callbacks.
        /// <para>
        /// The link decision is allowed only for pages whose query string begins with
        /// <c>?page=</c>. The page decision is allowed only when the URL carries a
        /// <c>page</c> query parameter that parses as an integer which has not been
        /// recorded in <c>_crawedPages</c> before.
        /// </para>
        /// </summary>
        public ArtistDataScraper()
        {
            // NOTE(review): the callbacks below mutate a plain HashSet; presumably this is
            // only safe because MaxConcurrentThreads is 1 - confirm before raising it.
            var config = new CrawlConfiguration
            {
                MaxConcurrentThreads               = 1,
                IsExternalPageCrawlingEnabled      = true,
                IsExternalPageLinksCrawlingEnabled = true
            };

            _crawler     = new EasyWebCrawler(config);
            _crawedPages = new HashSet <int>();
            _artistPages = new List <ArtistPage>(1000);

            _crawler.ShouldCrawlPageLinks((page, context) =>
            {
                // URL text is not linguistic data, so compare ordinally rather than
                // with the current culture.
                var allow = page.Uri.Query.StartsWith("?page=", System.StringComparison.Ordinal);

                //Console.WriteLine("CrawlLink? : [{0}], {1} -> {2} ", allow ? "O" : "X", page.ParentUri, page.Uri);
                return new CrawlDecision { Allow = allow };
            });

            _crawler.ShouldCrawlPage((page, context) =>
            {
                var parsedQuery      = HttpUtility.ParseQueryString(page.Uri.Query);
                var pageNumberString = parsedQuery.Get("page");
                var allow            = false;
                int pageNumber;

                // Parse with the invariant culture: the value comes from a URL, not from
                // user-facing text, so the machine's regional settings must not matter.
                if (pageNumberString != null &&
                    int.TryParse(pageNumberString,
                                 System.Globalization.NumberStyles.Integer,
                                 System.Globalization.CultureInfo.InvariantCulture,
                                 out pageNumber))
                {
                    // HashSet.Add returns false for a page number that was already
                    // recorded, which rejects repeat visits.
                    allow = _crawedPages.Add(pageNumber);
                }

                //Console.WriteLine("CrawlPage? : [{0}], {2} ", allow ? "O" : "X", page.ParentUri, page.Uri);
                return new CrawlDecision { Allow = allow };
            });

            _crawler.PageCrawlCompleted += CrawlerOnPageCrawlCompleted;
        }
Example #2
0
        /// <summary>
        /// Builds the crawler used to collect songs and wires up its crawl-decision
        /// callbacks.
        /// <para>
        /// The link decision is allowed for every page except those whose query string
        /// begins with <c>?id=</c>. The page decision is allowed when the URL carries an
        /// <c>id</c> query parameter that has not been recorded in <c>_crawledIds</c>
        /// before; when there is no <c>id</c>, it is allowed only if
        /// <c>searchartist</c> equals <c>1</c>.
        /// </para>
        /// </summary>
        public SongScraper()
        {
            // NOTE(review): the callbacks below mutate a plain HashSet; presumably this is
            // only safe because MaxConcurrentThreads is 1 - confirm before raising it.
            var config = new CrawlConfiguration
            {
                MaxConcurrentThreads               = 1,
                MaxCrawlDepth                      = 1,
                IsExternalPageCrawlingEnabled      = true,
                IsExternalPageLinksCrawlingEnabled = true
            };

            _crawler       = new EasyWebCrawler(config);
            _crawledIds    = new HashSet <string>();
            _songs         = new List <Song>();
            _pathCorrector = new PathStringCorrector();

            _crawler.ShouldCrawlPageLinks((page, context) =>
            {
                // URL text is not linguistic data, so compare ordinally rather than
                // with the current culture.
                var allow = !page.Uri.Query.StartsWith("?id=", System.StringComparison.Ordinal);

                //Console.WriteLine("CrawlLink? : [{0}], {1} -> {2} ", allow ? "O" : "X", page.ParentUri, page.Uri);
                return new CrawlDecision { Allow = allow };
            });

            _crawler.ShouldCrawlPage((page, context) =>
            {
                var parsedQuery  = HttpUtility.ParseQueryString(page.Uri.Query);
                var id           = parsedQuery.Get("id");
                var searchArtist = parsedQuery.Get("searchartist") == "1";

                // An id takes precedence: a repeated id is rejected even when
                // searchartist=1 is also present. HashSet.Add returns false for an id
                // that was already recorded.
                var allow = id != null ? _crawledIds.Add(id) : searchArtist;

                //Console.WriteLine("CrawlPage? : [{0}], {2} ", allow ? "O" : "X", page.ParentUri, page.Uri);
                return new CrawlDecision { Allow = allow };
            });

            _crawler.PageCrawlCompleted += CrawlerOnPageCrawlCompleted;
        }