Ejemplo n.º 1
0
        private void ProcessPage(string filePath)
        {
            DateTime currentDateTime = DateTime.Today;

            HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
            html.Load(filePath);

            HtmlNode doc = html.DocumentNode;

            appendLineToLog("Crawling a 'review page' for " + filePath);

            AmazonCrawlerEntities context = DbUtil.getNewContext();

            HtmlNode isbnNode = doc.SelectSingleNode(".//link[@rel='canonical']/@href");
            //<link rel="canonical" href="http://www.amazon.com/Breaking-Dawn-Twilight-Saga-Book/product-reviews/031606792X?pageNumber=113">
            Match  isbnMatch = Regex.Match(isbnNode.OuterHtml, @".+/product-reviews/(.+)\?");
            string isbn      = "";

            if (isbnMatch == Match.Empty)
            {
                isbnMatch = Regex.Match(isbnNode.OuterHtml, @".+/product-reviews/(.+)(/|"")");
            }
            isbn = isbnMatch.Groups[1].Value;

            Book b = DbUtil.getBook(context, isbn);

            if (b == null) //this should never happen
            {
                appendLineToLog("[error] ISBN " + isbn + " not found in database");
                return;
            }

            HtmlNodeCollection reviews = doc.SelectNodes(".//div[@class='a-section review']");

            int numProcessed    = 0;
            int numNotProcessed = 0;
            int numBadges       = 0;

            foreach (HtmlNode r in reviews)
            {
                //id
                string reviewId = r.Id;

                Review reviewToAdd = DbUtil.getOrCreateReview(context, reviewId);
                reviewToAdd.Book = b;


                //starRating
                HtmlNode ratingStringNode = r.SelectSingleNode(".//i[starts-with(@class, 'a-icon a-icon-star')]");
                short    rating           = CrawlUtil.GetStarRatingFromString(ratingStringNode.InnerText);
                reviewToAdd.starRating = rating;


                //reviewTitle
                HtmlNode titleNode = r.SelectSingleNode(".//a[contains(@class, 'review-title')]");
                string   title     = CrawlUtil.RemoveExcessWhitespaceFromString(titleNode.InnerText.Trim());
                reviewToAdd.reviewTitle = title;


                //reviewerId and reviewerName and badges
                HtmlNode reviewerNode = r.SelectSingleNode(".//a[contains(@class, 'author')]");
                if (reviewerNode != null)
                {
                    string profileUrl = reviewerNode.GetAttributeValue("href", "");
                    string reviewerId = "";

                    Match m = Regex.Match(profileUrl, @"profile/(.+)/ref");
                    if (m.Groups.Count == 2)
                    {
                        reviewerId = m.Groups[1].Value;
                    }

                    string reviewerName = CrawlUtil.RemoveExcessWhitespaceFromString(reviewerNode.InnerText);

                    Reviewer reviewer = DbUtil.getOrCreateReviewer(context, reviewerId);
                    reviewer.reviewerName = reviewerName;
                    reviewer.profileURL   = profileUrl;

                    reviewToAdd.reviewerId   = reviewerId;
                    reviewToAdd.reviewerName = reviewerName;

                    //badges
                    HtmlNodeCollection badgeNodes = r.SelectNodes(".//a[contains(@class, 'c7y-badge-link')]");
                    if (badgeNodes != null)
                    {
                        AmazonCrawlerEntities tmp = DbUtil.getNewContext();

                        foreach (HtmlNode badgeNode in badgeNodes)
                        {
                            try
                            {
                                string badgeText = CrawlUtil.RemoveExcessWhitespaceFromString(badgeNode.InnerText);

                                ReviewerBadge newBadge = new ReviewerBadge();
                                newBadge.reviewerId       = reviewerId;
                                newBadge.badge            = badgeText;
                                newBadge.statsCollectedAt = currentDateTime;

                                tmp.ReviewerBadges.AddObject(newBadge);
                                tmp.SaveChanges();

                                numBadges += 1;
                            }
                            catch (Exception ex)
                            {
                                //ignore :(
                            }
                        }
                    }
                }

                //publishedDate
                HtmlNode reviewDateNode = r.SelectSingleNode(".//span[contains(@class, 'review-date')]");
                string   dateText       = CrawlUtil.RemoveExcessWhitespaceFromString(reviewDateNode.InnerText);
                dateText = dateText.Replace("on", "");
                DateTime publishedDate = DateTime.Parse(dateText);
                reviewToAdd.publishedDate = publishedDate;


                //reviewContent
                HtmlNode reviewTextNode = r.SelectSingleNode(".//span[contains(@class, 'review-text')]");
                string   reviewContent  = CrawlUtil.RemoveExcessWhitespaceFromString(reviewTextNode.InnerText);
                reviewToAdd.reviewContent = reviewContent;

                int chars = reviewContent.Length > 200 ? 200 : reviewContent.Length;

                //numHelpful and numTotal
                //as at 2016-03-02 the 'helpful' text forrmat is "3 people found this helpful." (numTotal doesn't exist any more)
                HtmlNode helpfulNode = r.SelectSingleNode(".//span[contains(@class, 'review-votes')]");
                if (helpfulNode != null)
                {
                    if (helpfulNode.InnerText.Contains("of"))
                    {
                        int[] helpful = CrawlUtil.GetNumbersFromHelpfulText(helpfulNode.InnerText);

                        reviewToAdd.numHelpful = helpful[0];
                        reviewToAdd.numTotal   = helpful[1];
                    }
                    else
                    {
                        Match m = Regex.Match(helpfulNode.InnerText, @"(\d+) p.+", RegexOptions.IgnoreCase);
                        if (m.Groups.Count == 2)
                        {
                            reviewToAdd.numHelpful = Int32.Parse(m.Groups[1].Value);
                        }
                        reviewToAdd.numTotal = null;
                    }
                }

                //numComments
                HtmlNode numCommentsNode = r.SelectSingleNode(".//span[contains(@class, 'review-comment-total')]");
                int      numComments     = 0;
                Int32.TryParse(numCommentsNode.InnerText, out numComments);
                reviewToAdd.numComments = numComments;


                //isAmazonVerifiedPurchase
                HtmlNode verifiedPurchaseNode = r.SelectSingleNode(".//span[@class = 'a-size-mini a-color-state a-text-bold']");
                bool     isVerifiedPurchase   = false;
                if (verifiedPurchaseNode != null)
                {
                    isVerifiedPurchase = true;
                }
                reviewToAdd.isAmazonVerifiedPurchase = isVerifiedPurchase;

                //format
                HtmlNode formatNode = r.SelectSingleNode(".//div[contains(@class, 'review-data')]/a");
                if (formatNode != null)
                {
                    string formatText = CrawlUtil.RemoveExcessWhitespaceFromString(formatNode.InnerText).Replace("Format:", "").Trim();
                    reviewToAdd.format = formatText;
                }

                reviewToAdd.statsCollectedAt = currentDateTime;
                reviewToAdd.reviewType       = "R";

                ////debugging output
                //appendLineToLog("Processing review " + reviewId);
                //appendLineToLog(string.Format("\trating: {0}", rating));
                //appendLineToLog(string.Format("\ttitle: {0}", title));
                //appendLineToLog(string.Format("\tReviewer: {0} | {1}", reviewToAdd.reviewerId, reviewToAdd.reviewerName));
                ////badges not output
                //appendLineToLog(string.Format("\tPublished: {0}", publishedDate));
                //appendLineToLog(string.Format("\tContent: {0}", reviewContent.Substring(0, chars)));
                //appendLineToLog(string.Format("\tHelpful: {0} of {1}", reviewToAdd.numHelpful, reviewToAdd.numTotal));
                //appendLineToLog(string.Format("\tCommments: {0}", numComments));
                //appendLineToLog(string.Format("\tisVerifiedPurchase: {0}", isVerifiedPurchase));
                //appendLineToLog(string.Format("\tFormat: {0}", reviewToAdd.format));

                try
                {
                    context.SaveChanges();
                    numProcessed += 1;
                }
                catch (Exception ex)
                {
                    appendLineToLog(ex.Message);
                    appendLineToLog(ex.StackTrace);
                    numNotProcessed += 1;
                }
            }

            appendLineToLog(string.Format("{0} reviews saved/updated on page; {1} not saved; {2} badges.", numProcessed, numNotProcessed, numBadges));
        }
Ejemplo n.º 2
0
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            DateTime currentDateTime = DateTime.Now;

            HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;

            if (htmlDoc == null)
            {
                return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
            }

            //check if the correct page is loaded (sometimes Amazon redirects to a robot check
            if (propertyBag.Title.ToLower() == "robot check")
            {
                ReviewHtmlCrawler.form.appendLineToLog(propertyBag.Title + Environment.NewLine + propertyBag.Text.Substring(0, 100));

                //block the user agent that's currently in use
                ReviewHtmlCrawler.UserAgentTracker[crawler.UserAgent] = true;

                //log the page for retry
                File.AppendAllText("REVIEW-to-retry.txt", propertyBag.ResponseUri.OriginalString + System.Environment.NewLine);

                //sleep between 30 secs and 1 minute
                TimeSpan duration = new TimeSpan(0, 0, RandomNumber.Next(30, 60));

                ReviewHtmlCrawler.form.appendLineToLog(string.Format("Sleeping for {0}:{1} mins and secs before trying next book.", duration.Minutes, duration.Seconds));
                System.Threading.Thread.Sleep(duration);
                return;
            }

            ReviewHtmlCrawler.form.appendLineToLog("Crawling a 'review page' for " + ReviewHtmlCrawler.currentBook.DisplayString);
            ReviewHtmlCrawler.form.appendLineToLog(propertyBag.ResponseUri.ToString());

            AmazonCrawlerEntities context = DbUtil.getNewContext();

            Book b = DbUtil.getBook(context, ReviewHtmlCrawler.currentBook.ISBN);

            if (b == null) //this should never happen
            {
                ReviewHtmlCrawler.form.appendLineToLog("[error] ISBN " + ReviewHtmlCrawler.currentBook.ISBN + " not found in database");
                return;
            }

            //add all other pages of reviews if crawling reviews and on page 1
            //TODO: also get star rating distribution
            if (ReviewHtmlCrawler.GetReviews && isFirstPage(propertyBag))
            {
                ReviewHtmlCrawler.form.appendLineToLog("crawling first page");

                int numPages = CrawlUtil.GetReviewLastPageNumber(htmlDoc.DocumentNode);
                CrawlUtil.AddReviewPagesToCrawl(crawler, propertyBag.ResponseUri.OriginalString, numPages);

                ReviewHtmlCrawler.form.appendLineToLog(string.Format("***************************", numPages));
                ReviewHtmlCrawler.form.appendLineToLog(string.Format("*** {0} pages to crawl ***", numPages));
                ReviewHtmlCrawler.form.appendLineToLog(string.Format("***************************", numPages));

                if (ReviewHtmlCrawler.GetRatingStats)
                {
                    #region rating stats
                    //b.statsCollectedAt = currentDateTime;

                    ////as of 2015-12-11 the rating break-down is expressed in percentages rather than numbers.

                    //b.numFiveStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.FIVE);
                    //b.numFourStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.FOUR);
                    //b.numThreeStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.THREE);
                    //b.numTwoStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.TWO);
                    //b.numOneStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.ONE);

                    //try //save for rating stats
                    //{
                    //    context.SaveChanges();
                    //    updated += "RATING STATS;";
                    //}
                    //catch (Exception ex)
                    //{
                    //    //ignore :(
                    //    ReviewHtmlCrawler.form.appendLineToLog(ex.Message);
                    //}
                    #endregion
                }
            }

            if (ReviewHtmlCrawler.GetReviews)
            {
                HtmlNodeCollection reviews = htmlDoc.DocumentNode.SelectNodes(".//div[@class='a-section review']");

                int numProcessed    = 0;
                int numNotProcessed = 0;
                int numBadges       = 0;

                foreach (HtmlNode r in reviews)
                {
                    //id
                    string reviewId = r.Id;

                    Review reviewToAdd = DbUtil.getOrCreateReview(context, reviewId);
                    reviewToAdd.Book = b;


                    //starRating
                    HtmlNode ratingStringNode = r.SelectSingleNode(".//i[starts-with(@class, 'a-icon a-icon-star')]");
                    short    rating           = CrawlUtil.GetStarRatingFromString(ratingStringNode.InnerText);
                    reviewToAdd.starRating = rating;


                    //reviewTitle
                    HtmlNode titleNode = r.SelectSingleNode(".//a[contains(@class, 'review-title')]");
                    string   title     = CrawlUtil.RemoveExcessWhitespaceFromString(titleNode.InnerText.Trim());
                    reviewToAdd.reviewTitle = title;


                    //reviewerId and reviewerName and badges
                    HtmlNode reviewerNode = r.SelectSingleNode(".//a[contains(@class, 'author')]");
                    if (reviewerNode != null)
                    {
                        string profileUrl = reviewerNode.GetAttributeValue("href", "");
                        string reviewerId = "";

                        Match m = Regex.Match(profileUrl, @"profile/(.+)/ref");
                        if (m.Groups.Count == 2)
                        {
                            reviewerId = m.Groups[1].Value;
                        }

                        string reviewerName = CrawlUtil.RemoveExcessWhitespaceFromString(reviewerNode.InnerText);

                        Reviewer reviewer = DbUtil.getOrCreateReviewer(context, reviewerId);
                        reviewer.reviewerName = reviewerName;
                        reviewer.profileURL   = profileUrl;

                        reviewToAdd.reviewerId   = reviewerId;
                        reviewToAdd.reviewerName = reviewerName;

                        //badges
                        HtmlNodeCollection badgeNodes = r.SelectNodes(".//a[contains(@class, 'c7y-badge-link')]");
                        if (badgeNodes != null)
                        {
                            AmazonCrawlerEntities tmp = DbUtil.getNewContext();

                            foreach (HtmlNode badgeNode in badgeNodes)
                            {
                                try
                                {
                                    string badgeText = CrawlUtil.RemoveExcessWhitespaceFromString(badgeNode.InnerText);

                                    ReviewerBadge newBadge = new ReviewerBadge();
                                    newBadge.reviewerId       = reviewerId;
                                    newBadge.badge            = badgeText;
                                    newBadge.statsCollectedAt = currentDateTime;

                                    tmp.ReviewerBadges.AddObject(newBadge);
                                    tmp.SaveChanges();

                                    numBadges += 1;
                                }
                                catch (Exception ex)
                                {
                                    //ignore :(
                                }
                            }
                        }
                    }

                    //publishedDate
                    HtmlNode reviewDateNode = r.SelectSingleNode(".//span[contains(@class, 'review-date')]");
                    string   dateText       = CrawlUtil.RemoveExcessWhitespaceFromString(reviewDateNode.InnerText);
                    dateText = dateText.Replace("on", "");
                    DateTime publishedDate = DateTime.Parse(dateText);
                    reviewToAdd.publishedDate = publishedDate;


                    //reviewContent
                    HtmlNode reviewTextNode = r.SelectSingleNode(".//span[contains(@class, 'review-text')]");
                    string   reviewContent  = CrawlUtil.RemoveExcessWhitespaceFromString(reviewTextNode.InnerText);
                    reviewToAdd.reviewContent = reviewContent;

                    int chars = reviewContent.Length > 200 ? 200 : reviewContent.Length;

                    //numHelpful and numTotal
                    //as at 2016-03-02 the 'helpful' text forrmat is "3 people found this helpful." (numTotal doesn't exist any more)
                    HtmlNode helpfulNode = r.SelectSingleNode(".//span[contains(@class, 'review-votes')]");
                    if (helpfulNode != null)
                    {
                        if (helpfulNode.InnerText.Contains("of"))
                        {
                            int[] helpful = CrawlUtil.GetNumbersFromHelpfulText(helpfulNode.InnerText);

                            reviewToAdd.numHelpful = helpful[0];
                            reviewToAdd.numTotal   = helpful[1];
                        }
                        else
                        {
                            Match m = Regex.Match(helpfulNode.InnerText, @"(\d+) p.+", RegexOptions.IgnoreCase);
                            if (m.Groups.Count == 2)
                            {
                                reviewToAdd.numHelpful = Int32.Parse(m.Groups[1].Value);
                            }
                            reviewToAdd.numTotal = null;
                        }
                    }

                    //numComments
                    HtmlNode numCommentsNode = r.SelectSingleNode(".//span[contains(@class, 'review-comment-total')]");
                    int      numComments     = 0;
                    Int32.TryParse(numCommentsNode.InnerText, out numComments);
                    reviewToAdd.numComments = numComments;


                    //isAmazonVerifiedPurchase
                    HtmlNode verifiedPurchaseNode = r.SelectSingleNode(".//span[@class = 'a-size-mini a-color-state a-text-bold']");
                    bool     isVerifiedPurchase   = false;
                    if (verifiedPurchaseNode != null)
                    {
                        isVerifiedPurchase = true;
                    }
                    reviewToAdd.isAmazonVerifiedPurchase = isVerifiedPurchase;

                    //format
                    HtmlNode formatNode = r.SelectSingleNode(".//div[contains(@class, 'review-data')]/a");
                    if (formatNode != null)
                    {
                        string formatText = CrawlUtil.RemoveExcessWhitespaceFromString(formatNode.InnerText).Replace("Format:", "").Trim();
                        reviewToAdd.format = formatText;
                    }

                    reviewToAdd.statsCollectedAt = currentDateTime;
                    reviewToAdd.reviewType       = "R";

                    //debugging output

                    //ReviewHtmlCrawler.form.appendLineToLog("Processing review " + reviewId);
                    //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\trating: {0}", rating));
                    //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\ttitle: {0}", title));
                    //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\tReviewer: {0} | {1}", reviewToAdd.reviewerId, reviewToAdd.reviewerName));
                    ////badges not output
                    //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\tPublished: {0}", publishedDate));
                    //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\tContent: {0}", reviewContent.Substring(0, chars)));
                    //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\tHelpful: {0} of {1}", reviewToAdd.numHelpful, reviewToAdd.numTotal));
                    //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\tCommments: {0}", numComments));
                    //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\tisVerifiedPurchase: {0}", isVerifiedPurchase));
                    //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\tFormat: {0}", reviewToAdd.format));

                    try
                    {
                        context.SaveChanges();
                        numProcessed += 1;
                    }
                    catch (Exception ex)
                    {
                        ReviewHtmlCrawler.form.appendLineToLog(ex.Message);
                        ReviewHtmlCrawler.form.appendLineToLog(ex.StackTrace);
                        numNotProcessed += 1;
                    }
                }

                ReviewHtmlCrawler.form.appendLineToLog(string.Format("{0} reviews saved/updated on page; {1} not saved; {2} badges.", numProcessed, numNotProcessed, numBadges));
            }
        }