示例#1
0
        /// <summary>
        /// Queues review pages 2..N on the crawler, where N is the smaller of
        /// <paramref name="maxPage"/> and the page number extracted from
        /// <paramref name="lastPageUrl"/>. Each page URL is built by swapping the
        /// "&amp;pageNumber=X&amp;" token of the last-page URL for the target page number.
        /// </summary>
        /// <param name="crawler">Crawler that receives one step (depth 0) per generated page URL.</param>
        /// <param name="lastPageUrl">URL of the last review page; matched against pageNumberFromUrl.</param>
        /// <param name="maxPage">Upper bound on the page number to queue.</param>
        /// <remarks>
        /// Nothing is queued when the URL does not match or the captured value is not an integer.
        /// NOTE(review): assumes capture group 1 of pageNumberFromUrl is the page number - confirm
        /// against the regex definition.
        /// </remarks>
        internal static void AddReviewPagesToCrawl_Old(NCrawler.Crawler crawler, string lastPageUrl, int maxPage)
        {
            int   pageNum = 0;
            Match m       = pageNumberFromUrl.Match(lastPageUrl);

            // Groups.Count reflects the groups declared in the pattern, not whether the match
            // succeeded, so check Success and the parse result explicitly.
            if (!m.Success || !int.TryParse(m.Groups[1].Value, out pageNum))
            {
                return;
            }

            if (pageNum < maxPage)
            {
                maxPage = pageNum; //if there are less than maxPage pages then only add up to last page
            }

            // The token being replaced is the same for every page, so build it once.
            string lastPageToken = "&pageNumber=" + pageNum + "&";

            for (int i = 2; i <= maxPage; i++) //only crawl up to maxPage pages
            {
                //http://www.amazon.com/The-Screwtape-Letters-Proposes-Toast/product-reviews/0060652896/ref=cm_cr_pr_top_link_18?ie=UTF8&pageNumber=18&showViewpoints=0&sortBy=bySubmissionDateAscending
                crawler.AddStep(new Uri(lastPageUrl.Replace(lastPageToken, "&pageNumber=" + i + "&")), 0);
            }
        }
        /// <summary>
        /// Writes the normalized link to <c>_logger</c> (when one is set) and then
        /// hands the step to the base implementation unchanged.
        /// </summary>
        /// <param name="crawler">Crawler forwarded to the base implementation.</param>
        /// <param name="propertyBag">Property bag forwarded to the base implementation.</param>
        /// <param name="normalizedLink">Normalized link; this is the value that gets logged.</param>
        /// <param name="link">Link forwarded to the base implementation.</param>
        protected override void AddStepToCrawler(NCrawler.Crawler crawler, NCrawler.PropertyBag propertyBag, string normalizedLink, string link)
        {
            // Logging is optional: skip it when no logger has been configured.
            bool hasLogger = _logger != null;

            if (hasLogger)
            {
                _logger.DebugFormat("Crawler:AddStepToCrawler | {0}", normalizedLink);
            }

            base.AddStepToCrawler(crawler, propertyBag, normalizedLink, link);
        }
示例#3
0
        /// <summary>
        /// Queues review pages 2 through <paramref name="lastPageNum"/> on the crawler by
        /// rewriting the "&amp;pageNumber=1" and "ref=cm_cr_pr_btm_link_1" parts of
        /// <paramref name="baseUrl"/> to the target page number.
        /// </summary>
        /// <param name="crawler">Crawler that receives one step (depth 0) per generated page URL.</param>
        /// <param name="baseUrl">URL whose page-1 markers are replaced; presumably the first review page - confirm with caller.</param>
        /// <param name="lastPageNum">Highest page number to queue; values below 2 queue nothing.</param>
        internal static void AddReviewPagesToCrawl(NCrawler.Crawler crawler, string baseUrl, int lastPageNum)
        {
            int page = 2;

            while (page <= lastPageNum)
            {
                // Example target:
                //http://www.amazon.com/We-Yevgeny-Zamyatin/product-reviews/0140185852/ref=cm_cr_pr_btm_link_4?pageSize=50&pageNumber=4&sortBy=recent
                string pageUrl = baseUrl
                    .Replace("&pageNumber=1", "&pageNumber=" + page)
                    .Replace("ref=cm_cr_pr_btm_link_1", "ref=cm_cr_pr_btm_link_" + page);

                crawler.AddStep(new Uri(pageUrl), 0);
                page++;
            }
        }