コード例 #1
0
        /// <summary>
        /// Builds a crawler for <paramref name="url"/> (HTML processing followed by a
        /// save-to-file step), configures it for a single-threaded, depth-1 crawl that
        /// ignores robots rules, and starts it.
        /// </summary>
        /// <param name="parentForm">Form stored in the static <c>Form</c> member.</param>
        /// <param name="url">Start address; stored in the static <c>Url</c> member and used as the crawl root.</param>
        public static void Run(SavePageForm parentForm, string url)
        {
            Form = parentForm;
            Url  = url;

            Uri startAddress = new Uri(url);
            Crawler crawler  = new Crawler(startAddress, new HtmlDocumentProcessor(), new SaveFileStep());
            c = crawler;

            // One thread, one level deep.
            crawler.MaximumThreadCount = 1;
            crawler.MaximumCrawlDepth  = 1;
            crawler.ExcludeFilter      = CrawlUtil.ExtensionsToSkip;

            crawler.AdhereToRobotRules = false;
            crawler.CrawlFinished     += c_CrawlFinished;

            // Pick a user agent that has not been blocked; when none is left,
            // start over with a fresh tracker and pick again.
            string agent = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);
            if (agent == null)
            {
                UserAgentTracker = CrawlUtil.InitUserAgentTracker();
                agent            = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);
            }
            crawler.UserAgent = agent;

            // Begin crawl
            crawler.Crawl();
        }
コード例 #2
0
        /*
         * [numFiveStarRatings][numFourStarRatings][numThreeStarRatings][numTwoStarRatings][numOneStarRatings]
         * <tr>
         *  <td align="left" style="padding-right:0.5em;padding-bottom:1px;white-space:nowrap;font-size:10px;">
         *      <a href="http://www.amazon.com/Already-Dead-Joe-Pitt-Novel/product-reviews/034547824X/ref=cm_cr_pr_hist_5?ie=UTF8&filterBy=addFiveStar&showViewpoints=0" style="font-family:Verdana,Arial,Helvetica,Sans-serif;">
         *          5 star
         *      </a>:
         *  </td>
         *  <td style="min-width:60; background-color: #eeeecc" width="60" align="left" class="tiny" title="49%">
         *      <...>
         *  </td>
         *  <td align="right" style="font-family:Verdana,Arial,Helvetica,Sans-serif;;font-size:10px;">
         *      &nbsp;(82)
         *  </td>
         * </tr>
         */
        /// <summary>
        /// Reads the rating count for one star level from the histogram row whose link
        /// contains <c>filterBy=add{starNum}Star</c>. Three extraction strategies are tried
        /// in turn; a strategy is considered failed when it yields -1.
        /// </summary>
        /// <param name="htmlDoc">Parsed page to search.</param>
        /// <param name="b">Not used by this method; kept so existing callers keep compiling.</param>
        /// <param name="starNum">Star-level token inserted into the link filter.</param>
        /// <returns>
        /// 0 when the histogram row is not found (or the first extraction throws);
        /// otherwise the result of the last strategy attempted, which is -1 when all fail.
        /// </returns>
        internal static int GetStarRating(HtmlDocument htmlDoc, Book b, string starNum)
        {
            int rating = 0;

            string linkStep = "a[contains(@href,'filterBy=add" + starNum + "Star')]";

            // The row is the grandparent of the filter link.
            var nStarNode = htmlDoc.DocumentNode.SelectSingleNode("//" + linkStep + "/../..");
            if (nStarNode == null)
            {
                return rating;
            }

            try
            {
                //this method works for a review page
                rating = CrawlUtil.ExtractRatingFromStarString(CrawlUtil.TidyHtmlText(nStarNode.InnerText));

                //try another method (for detail page)
                if (rating == -1)
                {
                    // ".//" keeps the search inside the row; a leading "//" would
                    // restart from the document root even when called on a node.
                    var linkNode = nStarNode.SelectSingleNode(".//" + linkStep);
                    if (linkNode != null)
                    {
                        rating = CrawlUtil.ExtractRatingFromStarStringFromDetailPage(CrawlUtil.TidyHtmlText(linkNode.InnerText));
                    }
                }

                //if it failed, try another method (for detail page)
                if (rating == -1)
                {
                    rating = CrawlUtil.ExtractRatingFromStarStringFromDetailPage(CrawlUtil.TidyHtmlText(nStarNode.InnerText));
                }
            }
            catch (Exception ex)
            {
                // Best effort: a parsing failure must not abort the crawl, so the value
                // gathered so far is returned. The failure is surfaced in debug builds only.
                System.Diagnostics.Debug.WriteLine("[" + starNum + " star rating] not found (" + ex.Message + ")");
            }

            return rating;
        }
コード例 #3
0
        /// <summary>
        /// Starts a single-threaded, depth-1 crawl of a book's review pages, beginning at
        /// page 1 with 50 reviews per page.
        /// </summary>
        /// <param name="parentForm">Form stored in the static <c>form</c> member.</param>
        /// <param name="book">Book whose <c>reviewPageURL</c> is the crawl root; it may be rewritten (see below).</param>
        /// <param name="getRatingStats">Stored in the static <c>GetRatingStats</c> member.</param>
        /// <param name="getReviews">Stored in the static <c>GetReviews</c> member.</param>
        public static void Run(MainForm parentForm, Book book, bool getRatingStats, bool getReviews)
        {
            form = parentForm;
            IsolatedStorageModule.Setup(false);

            GetRatingStats = getRatingStats;
            GetReviews     = getReviews;
            currentBook    = book;
            baseUri        = book.reviewPageURL;

            /*
             * 140185852	We
             * http://www.amazon.com/We-Yevgeny-Zamyatin/product-reviews/0140185852
             * http://www.amazon.com/We-Yevgeny-Zamyatin/product-reviews/0140185852/ref=cm_cr_pr_btm_link_4?pageSize=50&pageNumber=4&sortBy=recent
             *
             * 618260307	The Hobbit	http://www.amazon.com/The-Hobbit-J-R-R-Tolkien/product-reviews/0618260307
             */

            // Append the paging suffix only when the stored URL does not already carry one.
            // Appending unconditionally produced a doubled "/ref=...?..." tail (an invalid
            // address) whenever the same Book was crawled a second time, because the first
            // run writes the suffixed URL back to the book.
            if (!currentBook.reviewPageURL.Contains("/ref=cm_cr_pr_btm_link"))
            {
                baseUri += "/ref=cm_cr_pr_btm_link_1?pageSize=50&pageNumber=1";
                currentBook.reviewPageURL = baseUri; //hack to make isFirstPage() work [2016-02-04]
            }

            c = new Crawler(new Uri(baseUri),
                            new HtmlDocumentProcessor(), // Process html
                            new ReviewPageProcessStep(),
                            new SaveFileStep());

            c.MaximumThreadCount = 1;
            c.MaximumCrawlDepth  = 1;
            c.ExcludeFilter      = CrawlUtil.ExtensionsToSkip;

            c.AdhereToRobotRules = false;
            c.BeforeDownload    += new EventHandler <NCrawler.Events.BeforeDownloadEventArgs>(c_BeforeDownload);

            string ua = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);

            //if there are no unblocked user agents left then reset the tracker and retry
            if (ua == null)
            {
                UserAgentTracker = CrawlUtil.InitUserAgentTracker();
                ua = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);
            }
            c.UserAgent = ua;

            // Begin crawl
            c.Crawl();
        }
コード例 #4
0
        /// <summary>
        /// Starts a single-threaded, depth-1 crawl of a book's details page using the
        /// HTML processor followed by the details-page dumper step.
        /// </summary>
        /// <param name="parentForm">Form stored in the static <c>form</c> member.</param>
        /// <param name="book">Book whose <c>detailPageURL</c> is escaped and used as the crawl root.</param>
        /// <param name="getDetailsAndAuthor">Stored in the static <c>GetDetailsAndAuthor</c> member.</param>
        /// <param name="getRanks">Stored in the static <c>GetRanks</c> member.</param>
        public static void Run(MainForm parentForm, Book book, bool getDetailsAndAuthor, bool getRanks)
        {
            form = parentForm;
            IsolatedStorageModule.Setup(false);

            GetDetailsAndAuthor = getDetailsAndAuthor;
            GetRanks            = getRanks;
            currentBook         = book;
            baseUri             = book.detailPageURL;

            string escapedAddress = Uri.EscapeUriString(baseUri);
            Crawler crawler = new Crawler(new Uri(escapedAddress), new HtmlDocumentProcessor(), new DetailPageDumperStep());
            c = crawler;

            // One thread, one level deep.
            crawler.MaximumThreadCount = 1;
            crawler.MaximumCrawlDepth  = 1;
            crawler.ExcludeFilter      = CrawlUtil.ExtensionsToSkip;

            crawler.AdhereToRobotRules = false;

            // Pick a user agent that has not been blocked; when none is left,
            // start over with a fresh tracker and pick again.
            string agent = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);
            if (agent == null)
            {
                UserAgentTracker = CrawlUtil.InitUserAgentTracker();
                agent            = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);
            }
            crawler.UserAgent = agent;

            // Begin crawl
            crawler.Crawl();
        }
コード例 #5
0
        /// <summary>
        /// Pipeline step for a book's details page. When the page is Amazon's "robot check",
        /// the current user agent is marked as blocked and the thread sleeps for a random
        /// interval. Otherwise, if <c>BookHtmlCrawler.GetDetailsAndAuthor</c> is set, the
        /// average rating and the review-page URL are read from the page and saved on the
        /// book record.
        /// </summary>
        /// <param name="crawler">Running crawler; its <c>UserAgent</c> is the key marked as blocked on a robot check.</param>
        /// <param name="propertyBag">Download result; supplies the parsed document ("HtmlDoc"), title, text and response URI.</param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            //using c.MaximumCrawlDepth = 1 means only the details page will be processed (no other links are crawled as they're depth 2)

            HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;

            if (htmlDoc == null)
            {
                return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
            }

            BookHtmlCrawler.form.appendLineToLog("Crawling 'details page' for " + BookHtmlCrawler.currentBook.DisplayString);
            BookHtmlCrawler.form.appendLineToLog(propertyBag.ResponseUri.OriginalString);

            lock (this)
            {
                //check if the correct page is loaded (sometimes Amazon redirects to a robot check)
                // string.Equals tolerates a null title, unlike Title.ToLower().
                if (string.Equals(propertyBag.Title, "robot check", StringComparison.OrdinalIgnoreCase))
                {
                    // Log at most the first 100 characters; a bare Substring(0, 100)
                    // throws when the page text is shorter than that.
                    string pageText = propertyBag.Text ?? "";
                    string preview  = pageText.Substring(0, Math.Min(100, pageText.Length));
                    BookHtmlCrawler.form.appendLineToLog(propertyBag.Title + Environment.NewLine + preview);

                    //block the user agent that's currently in use
                    BookHtmlCrawler.UserAgentTracker[crawler.UserAgent] = true;

                    //sleep between 30 secs and 3 minutes
                    TimeSpan duration = new TimeSpan(0, 0, RandomNumber.Next(30, 3 * 60));

                    BookHtmlCrawler.form.appendLineToLog(string.Format("Sleeping for {0}:{1} mins and secs before trying next book.", duration.Minutes, duration.Seconds));
                    System.Threading.Thread.Sleep(duration);
                    return;
                }

                string updated = "";
                using (AmazonCrawlerEntities context = DbUtil.getNewContext())
                {
                    Book b = DbUtil.getBook(context, BookHtmlCrawler.currentBook.ISBN);

                    if (b == null) //this should never happen
                    {
                        BookHtmlCrawler.form.appendLineToLog("[error] ISBN " + BookHtmlCrawler.currentBook.ISBN + " not found in database");
                        return;
                    }

                    if (BookHtmlCrawler.GetDetailsAndAuthor)
                    {
                        //author, language, reading level, format, sales ranks not stored.
                        #region average rating

                        var averageRatingNode = htmlDoc.DocumentNode.SelectSingleNode(@"//div[@id='avgRating']/span/a");
                        if (averageRatingNode != null)
                        {
                            string  ratingText = averageRatingNode.InnerText;
                            decimal rating     = CrawlUtil.ExtractRatingFromSummaryString(CrawlUtil.TidyHtmlText(ratingText));
                            b.avgCustomerReview = rating;
                            BookHtmlCrawler.form.appendLineToLog("\tavg: " + rating);
                        }
                        #endregion

                        #region review page URL
                        string url           = null;
                        var    reviewUrlNode = htmlDoc.DocumentNode.SelectSingleNode(@"//a[@class='a-link-emphasis a-nowrap']/@href");
                        if (reviewUrlNode != null)
                        {
                            var working_url = reviewUrlNode.GetAttributeValue("href", null);

                            if (!string.IsNullOrEmpty(working_url))
                            {
                                // Keep only the part of the link before the "/ref=" tracking segment.
                                int refIndex = working_url.IndexOf("/ref=", StringComparison.Ordinal);
                                if (refIndex > 0)
                                {
                                    url = working_url.Substring(0, refIndex);
                                }
                            }
                        }

                        try //save for rating and reviewURL
                        {
                            BookHtmlCrawler.form.appendLineToLog("\treview URL added: " + url);
                            b.reviewPageURL = url ?? "-"; // "-" is stored when no review link was found

                            context.SaveChanges();
                        }
                        catch (Exception ex)
                        {
                            // Best effort: the failure is logged and processing continues.
                            BookHtmlCrawler.form.appendLineToLog(ex.Message);
                        }
                        #endregion

                        #region rating stats
                        //2015-12-01: disabled this feature. It looks like the ratings are now in %s rather than numbers.
                        //(The per-star counts were previously read from the details page via CrawlUtil.getStarRating and saved here.)
                        #endregion

                        updated += "DETAILS (avg rating, reviewURL)";
                    }
                }

                BookHtmlCrawler.form.appendLineToLog("Updated " + updated + " for " + BookHtmlCrawler.currentBook.DisplayString);
            }
        }
コード例 #6
0
        /// <summary>
        /// Parses a saved review page from disk and stores every review found on it.
        /// The book is identified by the ISBN segment of the page's canonical link; for each
        /// review the rating, title, reviewer (with badges), date, text, helpful votes,
        /// comment count, verified-purchase flag and format are extracted and saved.
        /// A summary line with the saved / not-saved / badge counts is logged at the end.
        /// </summary>
        /// <param name="filePath">Path of the HTML file to load.</param>
        private void ProcessPage(string filePath)
        {
            DateTime currentDateTime = DateTime.Today;

            HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
            html.Load(filePath);

            HtmlNode doc = html.DocumentNode;

            appendLineToLog("Crawling a 'review page' for " + filePath);

            HtmlNode isbnNode = doc.SelectSingleNode(".//link[@rel='canonical']/@href");
            if (isbnNode == null)
            {
                // Without the canonical link the book cannot be identified.
                appendLineToLog("[error] canonical link not found in " + filePath);
                return;
            }

            //<link rel="canonical" href="http://www.amazon.com/Breaking-Dawn-Twilight-Saga-Book/product-reviews/031606792X?pageNumber=113">
            Match isbnMatch = Regex.Match(isbnNode.OuterHtml, @".+/product-reviews/(.+)\?");
            if (!isbnMatch.Success)
            {
                // No query string: the ISBN ends at the next slash or the closing quote.
                isbnMatch = Regex.Match(isbnNode.OuterHtml, @".+/product-reviews/(.+)(/|"")");
            }
            string isbn = isbnMatch.Groups[1].Value;

            int numProcessed    = 0;
            int numNotProcessed = 0;
            int numBadges       = 0;

            using (AmazonCrawlerEntities context = DbUtil.getNewContext())
            {
                Book b = DbUtil.getBook(context, isbn);

                if (b == null) //this should never happen
                {
                    appendLineToLog("[error] ISBN " + isbn + " not found in database");
                    return;
                }

                // SelectNodes yields null (not an empty collection) when nothing matches.
                HtmlNodeCollection reviews = doc.SelectNodes(".//div[@class='a-section review']");

                if (reviews != null)
                {
                    foreach (HtmlNode r in reviews)
                    {
                        //id
                        string reviewId = r.Id;

                        Review reviewToAdd = DbUtil.getOrCreateReview(context, reviewId);
                        reviewToAdd.Book = b;


                        //starRating
                        HtmlNode ratingStringNode = r.SelectSingleNode(".//i[starts-with(@class, 'a-icon a-icon-star')]");
                        short    rating           = CrawlUtil.GetStarRatingFromString(ratingStringNode.InnerText);
                        reviewToAdd.starRating = rating;


                        //reviewTitle
                        HtmlNode titleNode = r.SelectSingleNode(".//a[contains(@class, 'review-title')]");
                        string   title     = CrawlUtil.RemoveExcessWhitespaceFromString(titleNode.InnerText.Trim());
                        reviewToAdd.reviewTitle = title;


                        //reviewerId and reviewerName and badges
                        HtmlNode reviewerNode = r.SelectSingleNode(".//a[contains(@class, 'author')]");
                        if (reviewerNode != null)
                        {
                            string profileUrl = reviewerNode.GetAttributeValue("href", "");
                            string reviewerId = "";

                            // Groups.Count reflects the pattern, not the outcome, so Success
                            // is the only reliable test for a match.
                            Match profileMatch = Regex.Match(profileUrl, @"profile/(.+)/ref");
                            if (profileMatch.Success)
                            {
                                reviewerId = profileMatch.Groups[1].Value;
                            }

                            string reviewerName = CrawlUtil.RemoveExcessWhitespaceFromString(reviewerNode.InnerText);

                            Reviewer reviewer = DbUtil.getOrCreateReviewer(context, reviewerId);
                            reviewer.reviewerName = reviewerName;
                            reviewer.profileURL   = profileUrl;

                            reviewToAdd.reviewerId   = reviewerId;
                            reviewToAdd.reviewerName = reviewerName;

                            //badges
                            HtmlNodeCollection badgeNodes = r.SelectNodes(".//a[contains(@class, 'c7y-badge-link')]");
                            if (badgeNodes != null)
                            {
                                // Badges are saved through their own context, one SaveChanges per badge.
                                using (AmazonCrawlerEntities tmp = DbUtil.getNewContext())
                                {
                                    foreach (HtmlNode badgeNode in badgeNodes)
                                    {
                                        try
                                        {
                                            string badgeText = CrawlUtil.RemoveExcessWhitespaceFromString(badgeNode.InnerText);

                                            ReviewerBadge newBadge = new ReviewerBadge();
                                            newBadge.reviewerId       = reviewerId;
                                            newBadge.badge            = badgeText;
                                            newBadge.statsCollectedAt = currentDateTime;

                                            tmp.ReviewerBadges.AddObject(newBadge);
                                            tmp.SaveChanges();

                                            numBadges += 1;
                                        }
                                        catch (Exception)
                                        {
                                            // Deliberately ignored so one badge cannot stop the page.
                                            // NOTE(review): presumably these are duplicate-badge inserts - confirm.
                                        }
                                    }
                                }
                            }
                        }

                        //publishedDate
                        HtmlNode reviewDateNode = r.SelectSingleNode(".//span[contains(@class, 'review-date')]");
                        string   dateText       = CrawlUtil.RemoveExcessWhitespaceFromString(reviewDateNode.InnerText);
                        dateText = dateText.Replace("on", "");
                        DateTime publishedDate = DateTime.Parse(dateText);
                        reviewToAdd.publishedDate = publishedDate;


                        //reviewContent
                        HtmlNode reviewTextNode = r.SelectSingleNode(".//span[contains(@class, 'review-text')]");
                        string   reviewContent  = CrawlUtil.RemoveExcessWhitespaceFromString(reviewTextNode.InnerText);
                        reviewToAdd.reviewContent = reviewContent;

                        //numHelpful and numTotal
                        //as at 2016-03-02 the 'helpful' text format is "3 people found this helpful." (numTotal doesn't exist any more)
                        HtmlNode helpfulNode = r.SelectSingleNode(".//span[contains(@class, 'review-votes')]");
                        if (helpfulNode != null)
                        {
                            if (helpfulNode.InnerText.Contains("of"))
                            {
                                int[] helpful = CrawlUtil.GetNumbersFromHelpfulText(helpfulNode.InnerText);

                                reviewToAdd.numHelpful = helpful[0];
                                reviewToAdd.numTotal   = helpful[1];
                            }
                            else
                            {
                                // Parse only on a real match: on a failed match Groups[1].Value
                                // is "" and Int32.Parse would throw.
                                Match helpfulMatch = Regex.Match(helpfulNode.InnerText, @"(\d+) p.+", RegexOptions.IgnoreCase);
                                if (helpfulMatch.Success)
                                {
                                    reviewToAdd.numHelpful = Int32.Parse(helpfulMatch.Groups[1].Value);
                                }
                                reviewToAdd.numTotal = null;
                            }
                        }

                        //numComments (stays 0 when the element is missing or not numeric)
                        HtmlNode numCommentsNode = r.SelectSingleNode(".//span[contains(@class, 'review-comment-total')]");
                        int      numComments     = 0;
                        if (numCommentsNode != null)
                        {
                            Int32.TryParse(numCommentsNode.InnerText, out numComments);
                        }
                        reviewToAdd.numComments = numComments;


                        //isAmazonVerifiedPurchase: the presence of the marker span is the flag
                        HtmlNode verifiedPurchaseNode = r.SelectSingleNode(".//span[@class = 'a-size-mini a-color-state a-text-bold']");
                        reviewToAdd.isAmazonVerifiedPurchase = verifiedPurchaseNode != null;

                        //format
                        HtmlNode formatNode = r.SelectSingleNode(".//div[contains(@class, 'review-data')]/a");
                        if (formatNode != null)
                        {
                            string formatText = CrawlUtil.RemoveExcessWhitespaceFromString(formatNode.InnerText).Replace("Format:", "").Trim();
                            reviewToAdd.format = formatText;
                        }

                        reviewToAdd.statsCollectedAt = currentDateTime;
                        reviewToAdd.reviewType       = "R";

                        try
                        {
                            context.SaveChanges();
                            numProcessed += 1;
                        }
                        catch (Exception ex)
                        {
                            appendLineToLog(ex.Message);
                            appendLineToLog(ex.StackTrace);
                            numNotProcessed += 1;
                        }
                    }
                }
            }

            appendLineToLog(string.Format("{0} reviews saved/updated on page; {1} not saved; {2} badges.", numProcessed, numNotProcessed, numBadges));
        }
コード例 #7
0
        /// <summary>
        /// Pipeline step for an author's details (rankings) page. Every
        /// <c>div.nodeRank</c> element becomes a <c>Ranking</c> row for the current author;
        /// when the page has none, a single row without a rank string is stored to mark
        /// the author as processed.
        /// </summary>
        /// <param name="crawler">Running crawler (not used by this step).</param>
        /// <param name="propertyBag">Download result; supplies the parsed document ("HtmlDoc") and the response URI.</param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            DateTime currentDateTime = DateTime.Now;

            HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;

            if (htmlDoc == null)
            {
                return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
            }

            AuthorHtmlCrawler.form.appendLineToLog("Crawling 'details (rankings) page' for " + AuthorHtmlCrawler.currentAuthor.DisplayString + " [ " + propertyBag.ResponseUri.OriginalString + " ]");

            lock (this)
            {
                // The context is disposed on every exit path, including the early return.
                using (AmazonCrawlerEntities context = DbUtil.getNewContext())
                {
                    Author a = DbUtil.getAuthor(context, AuthorHtmlCrawler.currentAuthor.id);

                    if (a == null) //this should never happen
                    {
                        AuthorHtmlCrawler.form.appendLineToLog("[error] author id " + AuthorHtmlCrawler.currentAuthor.id + " not found in database");
                        return;
                    }

                    //get rankings
                    var rankingNodes = htmlDoc.DocumentNode.SelectNodes(".//div[@class='nodeRank']");
                    if (rankingNodes != null)
                    {
                        foreach (var rankNode in rankingNodes)
                        {
                            try
                            {
                                Ranking r = new Ranking();
                                r.authorId         = a.id;
                                r.statsCollectedAt = currentDateTime;
                                r.rankString       = CrawlUtil.TidyHtmlText(rankNode.InnerText);

                                AuthorHtmlCrawler.form.appendLineToLog(r.rankString);
                                context.AddToRankings(r);
                                context.SaveChanges();
                            }
                            catch (Exception ex)
                            {
                                AuthorHtmlCrawler.form.appendLineToLog("**ERROR** " + ex.Message);
                                if (ex.InnerException != null)
                                {
                                    AuthorHtmlCrawler.form.appendLineToLog("\t** " + ex.InnerException.Message);
                                }
                            }
                        }
                    }
                    else
                    {
                        try //add a ranking with null rankstring to mark this author has been processed
                        {
                            Ranking r = new Ranking();
                            r.authorId = a.id;

                            context.AddToRankings(r);
                            context.SaveChanges();
                        }
                        catch (Exception ex)
                        {
                            // Still best effort, but report the failure the same way as the
                            // ranking loop so a lost marker row is visible in the log.
                            AuthorHtmlCrawler.form.appendLineToLog("**ERROR** " + ex.Message);
                            if (ex.InnerException != null)
                            {
                                AuthorHtmlCrawler.form.appendLineToLog("\t** " + ex.InnerException.Message);
                            }
                        }
                    }
                }

                AuthorHtmlCrawler.form.appendLineToLog("Updated rankings for " + AuthorHtmlCrawler.currentAuthor.DisplayString);
            }
        }
コード例 #8
0
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            DateTime currentDateTime = DateTime.Now;

            HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;

            if (htmlDoc == null)
            {
                return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
            }

            //check if the correct page is loaded (sometimes Amazon redirects to a robot check
            if (propertyBag.Title.ToLower() == "robot check")
            {
                ReviewHtmlCrawler.form.appendLineToLog(propertyBag.Title + Environment.NewLine + propertyBag.Text.Substring(0, 100));

                //block the user agent that's currently in use
                ReviewHtmlCrawler.UserAgentTracker[crawler.UserAgent] = true;

                //log the page for retry
                File.AppendAllText("REVIEW-to-retry.txt", propertyBag.ResponseUri.OriginalString + System.Environment.NewLine);

                //sleep between 30 secs and 1 minute
                TimeSpan duration = new TimeSpan(0, 0, RandomNumber.Next(30, 60));

                ReviewHtmlCrawler.form.appendLineToLog(string.Format("Sleeping for {0}:{1} mins and secs before trying next book.", duration.Minutes, duration.Seconds));
                System.Threading.Thread.Sleep(duration);
                return;
            }

            ReviewHtmlCrawler.form.appendLineToLog("Crawling a 'review page' for " + ReviewHtmlCrawler.currentBook.DisplayString);
            ReviewHtmlCrawler.form.appendLineToLog(propertyBag.ResponseUri.ToString());

            AmazonCrawlerEntities context = DbUtil.getNewContext();

            Book b = DbUtil.getBook(context, ReviewHtmlCrawler.currentBook.ISBN);

            if (b == null) //this should never happen
            {
                ReviewHtmlCrawler.form.appendLineToLog("[error] ISBN " + ReviewHtmlCrawler.currentBook.ISBN + " not found in database");
                return;
            }

            //add all other pages of reviews if crawling reviews and on page 1
            //TODO: also get star rating distribution
            if (ReviewHtmlCrawler.GetReviews && isFirstPage(propertyBag))
            {
                ReviewHtmlCrawler.form.appendLineToLog("crawling first page");

                int numPages = CrawlUtil.GetReviewLastPageNumber(htmlDoc.DocumentNode);
                CrawlUtil.AddReviewPagesToCrawl(crawler, propertyBag.ResponseUri.OriginalString, numPages);

                ReviewHtmlCrawler.form.appendLineToLog(string.Format("***************************", numPages));
                ReviewHtmlCrawler.form.appendLineToLog(string.Format("*** {0} pages to crawl ***", numPages));
                ReviewHtmlCrawler.form.appendLineToLog(string.Format("***************************", numPages));

                if (ReviewHtmlCrawler.GetRatingStats)
                {
                    #region rating stats
                    //b.statsCollectedAt = currentDateTime;

                    ////as of 2015-12-11 the rating break-down is expressed in percentages rather than numbers.

                    //b.numFiveStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.FIVE);
                    //b.numFourStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.FOUR);
                    //b.numThreeStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.THREE);
                    //b.numTwoStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.TWO);
                    //b.numOneStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.ONE);

                    //try //save for rating stats
                    //{
                    //    context.SaveChanges();
                    //    updated += "RATING STATS;";
                    //}
                    //catch (Exception ex)
                    //{
                    //    //ignore :(
                    //    ReviewHtmlCrawler.form.appendLineToLog(ex.Message);
                    //}
                    #endregion
                }
            }

            if (ReviewHtmlCrawler.GetReviews)
            {
                HtmlNodeCollection reviews = htmlDoc.DocumentNode.SelectNodes(".//div[@class='a-section review']");

                int numProcessed    = 0;
                int numNotProcessed = 0;
                int numBadges       = 0;

                foreach (HtmlNode r in reviews)
                {
                    //id
                    string reviewId = r.Id;

                    Review reviewToAdd = DbUtil.getOrCreateReview(context, reviewId);
                    reviewToAdd.Book = b;


                    //starRating
                    HtmlNode ratingStringNode = r.SelectSingleNode(".//i[starts-with(@class, 'a-icon a-icon-star')]");
                    short    rating           = CrawlUtil.GetStarRatingFromString(ratingStringNode.InnerText);
                    reviewToAdd.starRating = rating;


                    //reviewTitle
                    HtmlNode titleNode = r.SelectSingleNode(".//a[contains(@class, 'review-title')]");
                    string   title     = CrawlUtil.RemoveExcessWhitespaceFromString(titleNode.InnerText.Trim());
                    reviewToAdd.reviewTitle = title;


                    //reviewerId and reviewerName and badges
                    HtmlNode reviewerNode = r.SelectSingleNode(".//a[contains(@class, 'author')]");
                    if (reviewerNode != null)
                    {
                        string profileUrl = reviewerNode.GetAttributeValue("href", "");
                        string reviewerId = "";

                        Match m = Regex.Match(profileUrl, @"profile/(.+)/ref");
                        if (m.Groups.Count == 2)
                        {
                            reviewerId = m.Groups[1].Value;
                        }

                        string reviewerName = CrawlUtil.RemoveExcessWhitespaceFromString(reviewerNode.InnerText);

                        Reviewer reviewer = DbUtil.getOrCreateReviewer(context, reviewerId);
                        reviewer.reviewerName = reviewerName;
                        reviewer.profileURL   = profileUrl;

                        reviewToAdd.reviewerId   = reviewerId;
                        reviewToAdd.reviewerName = reviewerName;

                        //badges
                        HtmlNodeCollection badgeNodes = r.SelectNodes(".//a[contains(@class, 'c7y-badge-link')]");
                        if (badgeNodes != null)
                        {
                            AmazonCrawlerEntities tmp = DbUtil.getNewContext();

                            foreach (HtmlNode badgeNode in badgeNodes)
                            {
                                try
                                {
                                    string badgeText = CrawlUtil.RemoveExcessWhitespaceFromString(badgeNode.InnerText);

                                    ReviewerBadge newBadge = new ReviewerBadge();
                                    newBadge.reviewerId       = reviewerId;
                                    newBadge.badge            = badgeText;
                                    newBadge.statsCollectedAt = currentDateTime;

                                    tmp.ReviewerBadges.AddObject(newBadge);
                                    tmp.SaveChanges();

                                    numBadges += 1;
                                }
                                catch (Exception ex)
                                {
                                    //ignore :(
                                }
                            }
                        }
                    }

                    //publishedDate
                    HtmlNode reviewDateNode = r.SelectSingleNode(".//span[contains(@class, 'review-date')]");
                    string   dateText       = CrawlUtil.RemoveExcessWhitespaceFromString(reviewDateNode.InnerText);
                    dateText = dateText.Replace("on", "");
                    DateTime publishedDate = DateTime.Parse(dateText);
                    reviewToAdd.publishedDate = publishedDate;


                    //reviewContent
                    HtmlNode reviewTextNode = r.SelectSingleNode(".//span[contains(@class, 'review-text')]");
                    string   reviewContent  = CrawlUtil.RemoveExcessWhitespaceFromString(reviewTextNode.InnerText);
                    reviewToAdd.reviewContent = reviewContent;

                    int chars = reviewContent.Length > 200 ? 200 : reviewContent.Length;

                    //numHelpful and numTotal
                    //as of 2016-03-02 the 'helpful' text format is "3 people found this helpful." (numTotal doesn't exist any more)
                    HtmlNode helpfulNode = r.SelectSingleNode(".//span[contains(@class, 'review-votes')]");
                    if (helpfulNode != null)
                    {
                        if (helpfulNode.InnerText.Contains("of"))
                        {
                            int[] helpful = CrawlUtil.GetNumbersFromHelpfulText(helpfulNode.InnerText);

                            reviewToAdd.numHelpful = helpful[0];
                            reviewToAdd.numTotal   = helpful[1];
                        }
                        else
                        {
                            Match m = Regex.Match(helpfulNode.InnerText, @"(\d+) p.+", RegexOptions.IgnoreCase);
                            if (m.Groups.Count == 2)
                            {
                                reviewToAdd.numHelpful = Int32.Parse(m.Groups[1].Value);
                            }
                            reviewToAdd.numTotal = null;
                        }
                    }

                    //numComments
                    HtmlNode numCommentsNode = r.SelectSingleNode(".//span[contains(@class, 'review-comment-total')]");
                    int      numComments     = 0;
                    Int32.TryParse(numCommentsNode.InnerText, out numComments);
                    reviewToAdd.numComments = numComments;


                    //isAmazonVerifiedPurchase
                    HtmlNode verifiedPurchaseNode = r.SelectSingleNode(".//span[@class = 'a-size-mini a-color-state a-text-bold']");
                    bool     isVerifiedPurchase   = false;
                    if (verifiedPurchaseNode != null)
                    {
                        isVerifiedPurchase = true;
                    }
                    reviewToAdd.isAmazonVerifiedPurchase = isVerifiedPurchase;

                    //format
                    HtmlNode formatNode = r.SelectSingleNode(".//div[contains(@class, 'review-data')]/a");
                    if (formatNode != null)
                    {
                        string formatText = CrawlUtil.RemoveExcessWhitespaceFromString(formatNode.InnerText).Replace("Format:", "").Trim();
                        reviewToAdd.format = formatText;
                    }

                    reviewToAdd.statsCollectedAt = currentDateTime;
                    reviewToAdd.reviewType       = "R";

                    //debugging output

                    //ReviewHtmlCrawler.form.appendLineToLog("Processing review " + reviewId);
                    //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\trating: {0}", rating));
                    //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\ttitle: {0}", title));
                    //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\tReviewer: {0} | {1}", reviewToAdd.reviewerId, reviewToAdd.reviewerName));
                    ////badges not output
                    //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\tPublished: {0}", publishedDate));
                    //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\tContent: {0}", reviewContent.Substring(0, chars)));
                    //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\tHelpful: {0} of {1}", reviewToAdd.numHelpful, reviewToAdd.numTotal));
                    //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\tCommments: {0}", numComments));
                    //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\tisVerifiedPurchase: {0}", isVerifiedPurchase));
                    //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\tFormat: {0}", reviewToAdd.format));

                    try
                    {
                        context.SaveChanges();
                        numProcessed += 1;
                    }
                    catch (Exception ex)
                    {
                        ReviewHtmlCrawler.form.appendLineToLog(ex.Message);
                        ReviewHtmlCrawler.form.appendLineToLog(ex.StackTrace);
                        numNotProcessed += 1;
                    }
                }

                ReviewHtmlCrawler.form.appendLineToLog(string.Format("{0} reviews saved/updated on page; {1} not saved; {2} badges.", numProcessed, numNotProcessed, numBadges));
            }
        }