/// <summary>
/// Starts a single-threaded, depth-1 crawl of <paramref name="url"/> and saves
/// the downloaded page to disk via <c>SaveFileStep</c>.
/// </summary>
/// <param name="parentForm">Form used for UI/logging callbacks when the crawl finishes.</param>
/// <param name="url">Absolute URL of the page to download.</param>
public static void Run(SavePageForm parentForm, string url)
{
    Form = parentForm;
    Url = url;

    // Pipeline: parse the HTML, then persist the raw page to disk.
    c = new Crawler(new Uri(url),
        new HtmlDocumentProcessor(), // Process html
        new SaveFileStep());
    c.MaximumThreadCount = 1;
    c.MaximumCrawlDepth = 1;
    c.ExcludeFilter = CrawlUtil.ExtensionsToSkip;
    c.AdhereToRobotRules = false;
    c.CrawlFinished += c_CrawlFinished;

    // Choose a user agent that hasn't been blocked yet; when every agent has
    // been blocked, reset the tracker and draw again.
    string userAgent = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);
    if (userAgent == null)
    {
        UserAgentTracker = CrawlUtil.InitUserAgentTracker();
        userAgent = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);
    }
    c.UserAgent = userAgent;

    // Begin crawl
    c.Crawl();
}
/* Markup being scraped — one table row per star level
 * [numFiveStarRatings][numFourStarRatings][numThreeStarRatings][numTwoStarRatings][numOneStarRatings]:
 * <tr>
 *   <td align="left" style="...font-size:10px;">
 *     <a href="http://www.amazon.com/Already-Dead-Joe-Pitt-Novel/product-reviews/034547824X/ref=cm_cr_pr_hist_5?ie=UTF8&filterBy=addFiveStar&showViewpoints=0" style="...">
 *       5 star
 *     </a>:
 *   </td>
 *   <td style="min-width:60; ..." width="60" align="left" class="tiny" title="49%">
 *     <...>
 *   </td>
 *   <td align="right" style="...font-size:10px;">
 *     (82)
 *   </td>
 * </tr>
 */
/// <summary>
/// Extracts the number of ratings for one star level ("Five", "Four", ...) from
/// either a review page or a detail page. Tries the review-page parser first and
/// falls back to the detail-page parser. Returns 0 when the row is missing, and
/// whatever the last parser produced (possibly -1) when all attempts fail.
/// </summary>
internal static int GetStarRating(HtmlDocument htmlDoc, Book b, string starNum)
{
    // Locate the <tr> that contains the "N star" filter link (anchor -> td -> tr).
    var rowNode = htmlDoc.DocumentNode.SelectSingleNode("//a[contains(@href,'filterBy=add" + starNum + "Star')]/../..");
    if (rowNode == null)
    {
        return 0;
    }

    int rating = 0;
    try
    {
        // Attempt 1: review-page layout — parse the whole row's text.
        rating = CrawlUtil.ExtractRatingFromStarString(CrawlUtil.TidyHtmlText(rowNode.InnerText));

        // Attempt 2: detail-page layout — parse just the anchor's text.
        if (rating == -1)
        {
            var anchorNode = rowNode.SelectSingleNode("//a[contains(@href,'filterBy=add" + starNum + "Star')]");
            rating = CrawlUtil.ExtractRatingFromStarStringFromDetailPage(CrawlUtil.TidyHtmlText(anchorNode.InnerText));
        }

        // Attempt 3: detail-page parser over the whole row's text.
        if (rating == -1)
        {
            rating = CrawlUtil.ExtractRatingFromStarStringFromDetailPage(CrawlUtil.TidyHtmlText(rowNode.InnerText));
        }
    }
    catch (Exception)
    {
        // Parse failures are deliberately swallowed; the caller treats the
        // value produced so far as "not found".
        //old code
        //ReviewHtmlCrawler.form.appendLineToLog("***ERROR*** " + "\t** [" + starNum + " star rating] not found (" + ex.Message + ")");
    }

    return rating;
}
/// <summary>
/// Starts a single-threaded, depth-1 crawl of the review pages for
/// <paramref name="book"/>, optionally collecting rating statistics and/or
/// individual reviews (processed by <c>ReviewPageProcessStep</c>).
/// </summary>
public static void Run(MainForm parentForm, Book book, bool getRatingStats, bool getReviews)
{
    form = parentForm;
    IsolatedStorageModule.Setup(false);

    GetRatingStats = getRatingStats;
    GetReviews = getReviews;
    currentBook = book;
    baseUri = book.reviewPageURL;

    /*
     * Example review-page URLs:
     * 140185852 We
     * http://www.amazon.com/We-Yevgeny-Zamyatin/product-reviews/0140185852
     * http://www.amazon.com/We-Yevgeny-Zamyatin/product-reviews/0140185852/ref=cm_cr_pr_btm_link_4?pageSize=50&pageNumber=4&sortBy=recent
     *
     * 618260307 The Hobbit http://www.amazon.com/The-Hobbit-J-R-R-Tolkien/product-reviews/0618260307
     */
    // Force page 1 with 50 reviews per page.
    baseUri += "/ref=cm_cr_pr_btm_link_1?pageSize=50&pageNumber=1";
    if (!currentBook.reviewPageURL.Contains("/ref=cm_cr_pr_btm_link"))
    {
        currentBook.reviewPageURL = baseUri; //hack to make isFirstPage() work [2016-02-04]
    }

    c = new Crawler(new Uri(baseUri),
        new HtmlDocumentProcessor(), // Process html
        new ReviewPageProcessStep(),
        new SaveFileStep()); // Custom step to visualize crawl
    c.MaximumThreadCount = 1;
    c.MaximumCrawlDepth = 1;
    c.ExcludeFilter = CrawlUtil.ExtensionsToSkip;
    c.AdhereToRobotRules = false;
    c.BeforeDownload += c_BeforeDownload;

    // Choose a user agent that hasn't been blocked yet; when every agent has
    // been blocked, reset the tracker and draw again.
    string userAgent = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);
    if (userAgent == null)
    {
        UserAgentTracker = CrawlUtil.InitUserAgentTracker();
        userAgent = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);
    }
    c.UserAgent = userAgent;

    // Begin crawl
    c.Crawl();
}
/// <summary>
/// Starts a single-threaded, depth-1 crawl of the detail page for
/// <paramref name="book"/>, optionally collecting details/author info and ranks
/// (processed by <c>DetailPageDumperStep</c>).
/// </summary>
public static void Run(MainForm parentForm, Book book, bool getDetailsAndAuthor, bool getRanks)
{
    form = parentForm;
    IsolatedStorageModule.Setup(false);

    GetDetailsAndAuthor = getDetailsAndAuthor;
    GetRanks = getRanks;
    currentBook = book;
    baseUri = book.detailPageURL;

    // Detail-page URLs may contain characters that must be escaped before
    // System.Uri will accept them.
    Uri escapedUri = new Uri(Uri.EscapeUriString(baseUri));

    c = new Crawler(escapedUri,
        new HtmlDocumentProcessor(), // Process html
        new DetailPageDumperStep()); // Custom step to visualize crawl
    c.MaximumThreadCount = 1;
    c.MaximumCrawlDepth = 1;
    c.ExcludeFilter = CrawlUtil.ExtensionsToSkip;
    c.AdhereToRobotRules = false;

    // Choose a user agent that hasn't been blocked yet; when every agent has
    // been blocked, reset the tracker and draw again.
    string userAgent = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);
    if (userAgent == null)
    {
        UserAgentTracker = CrawlUtil.InitUserAgentTracker();
        userAgent = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);
    }
    c.UserAgent = userAgent;

    // Begin crawl
    c.Crawl();
}
/// <summary>
/// NCrawler pipeline step for a book 'details page': extracts the average
/// customer rating and the canonical review-page URL and writes them to the
/// current Book row. If Amazon served a "Robot Check" page instead, blacklists
/// the crawler's user agent and sleeps before returning.
/// </summary>
/// <param name="crawler">The running crawler; its UserAgent is blacklisted on a robot check.</param>
/// <param name="propertyBag">Per-page crawl state; ["HtmlDoc"] holds the parsed HTML document.</param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    //using c.MaximumCrawlDepth = 1 means only the details page will be processed (no other links are crawled as they're depth 2)
    DateTime currentDateTime = DateTime.Now;
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
    }
    BookHtmlCrawler.form.appendLineToLog("Crawling 'details page' for " + BookHtmlCrawler.currentBook.DisplayString);
    BookHtmlCrawler.form.appendLineToLog(propertyBag.ResponseUri.OriginalString);
    // NOTE(review): lock (this) is a known anti-pattern (external code could lock on
    // the same instance); a private readonly lock object would be safer. Kept as-is.
    lock (this)
    {
        //check if the correct page is loaded (sometimes Amazon redirects to a robot check
        if (propertyBag.Title.ToLower() == "robot check")
        {
            BookHtmlCrawler.form.appendLineToLog(propertyBag.Title + Environment.NewLine + propertyBag.Text.Substring(0, 100));
            //block the user agent that's currently in use
            BookHtmlCrawler.UserAgentTracker[crawler.UserAgent] = true;
            //sleep between 30 secs and 3 minutes
            TimeSpan duration = new TimeSpan(0, 0, RandomNumber.Next(30, 3 * 60));
            BookHtmlCrawler.form.appendLineToLog(string.Format("Sleeping for {0}:{1} mins and secs before trying next book.", duration.Minutes, duration.Seconds));
            System.Threading.Thread.Sleep(duration);
            return;
        }
        string updated = "";
        using (AmazonCrawlerEntities context = DbUtil.getNewContext())
        {
            Book b = DbUtil.getBook(context, BookHtmlCrawler.currentBook.ISBN);
            Author a = null; // NOTE(review): never assigned or read below; looks like leftover from removed author scraping — confirm before deleting.
            if (b == null) //this should never happen
            {
                BookHtmlCrawler.form.appendLineToLog("[error] ISBN " + BookHtmlCrawler.currentBook.ISBN + " not found in database");
                return;
            }
            if (BookHtmlCrawler.GetDetailsAndAuthor)
            {
                //author, language, reading level, format, sales ranks not stored.
                #region average rating
                // Average rating comes from the avgRating div's anchor text.
                var averageRatingNode = htmlDoc.DocumentNode.SelectSingleNode(@"//div[@id='avgRating']/span/a");
                if (averageRatingNode != null)
                {
                    string ratingText = averageRatingNode.InnerText;
                    decimal rating = CrawlUtil.ExtractRatingFromSummaryString(CrawlUtil.TidyHtmlText(ratingText));
                    b.avgCustomerReview = rating;
                    BookHtmlCrawler.form.appendLineToLog("\tavg: " + rating);
                }
                #endregion
                #region review page URL
                // The "see all reviews" link; everything from "/ref=" onward is
                // tracking junk and is trimmed so the stored URL is canonical.
                string url = null;
                var reviewUrlNode = htmlDoc.DocumentNode.SelectSingleNode(@"//a[@class='a-link-emphasis a-nowrap']/@href");
                if (reviewUrlNode != null)
                {
                    var working_url = reviewUrlNode.GetAttributeValue("href", null);
                    if (!string.IsNullOrEmpty(working_url) && working_url.IndexOf("/ref=") > 0)
                    {
                        url = working_url.Substring(0, working_url.IndexOf("/ref="));
                    }
                }
                try //save for rating and reviewURL
                {
                    BookHtmlCrawler.form.appendLineToLog("\treview URL added: " + url);
                    b.reviewPageURL = url ?? "-"; // "-" marks "no review URL found" so the row is still stamped
                    context.SaveChanges();
                }
                catch (Exception ex)
                {
                    //ignore :(
                    BookHtmlCrawler.form.appendLineToLog(ex.Message);
                }
                #endregion
                //2015-12-01: disabled this feature. It looks like the ratings are now in %s rather than numbers.
                #region rating stats
                //TEMP 2014-06-12: get star rating from details page
                //b.statsCollectedAt = currentDateTime;
                //b.numFiveStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.FIVE);
                //b.numFourStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.FOUR);
                //b.numThreeStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.THREE);
                //b.numTwoStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.TWO);
                //b.numOneStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.ONE);
                //BookHtmlCrawler.form.appendLineToLog("\t" + CrawlUtil.FIVE + " star: " + b.numFiveStarRatings);
                //BookHtmlCrawler.form.appendLineToLog("\t" + CrawlUtil.FOUR + " star: " + b.numFourStarRatings);
                //BookHtmlCrawler.form.appendLineToLog("\t" + CrawlUtil.THREE + " star: " + b.numThreeStarRatings);
                //BookHtmlCrawler.form.appendLineToLog("\t" + CrawlUtil.TWO + " star: " + b.numTwoStarRatings);
                //BookHtmlCrawler.form.appendLineToLog("\t" + CrawlUtil.ONE + " star: " + b.numOneStarRatings);
                //try //save for rating stats
                //{
                //    context.SaveChanges();
                //    updated += "RATING STATS;";
                //}
                //catch (Exception ex)
                //{
                //    //ignore :(
                //    ReviewHtmlCrawler.form.appendLineToLog(ex.Message);
                //}
                #endregion
                updated += "DETAILS (avg rating, reviewURL)";
            }
        }
        BookHtmlCrawler.form.appendLineToLog("Updated " + updated + " for " + BookHtmlCrawler.currentBook.DisplayString);
    }
}
/// <summary>
/// Parses one saved review-page HTML file from disk and writes every review on
/// it (plus reviewer records and reviewer badges) to the database. The book is
/// identified via the page's canonical link. Logs a per-page summary at the end.
/// </summary>
/// <param name="filePath">Path of the saved HTML file to process.</param>
private void ProcessPage(string filePath)
{
    // Date-only stamp for statsCollectedAt (note: other crawl steps use DateTime.Now).
    DateTime currentDateTime = DateTime.Today;
    HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
    html.Load(filePath);
    HtmlNode doc = html.DocumentNode;
    appendLineToLog("Crawling a 'review page' for " + filePath);
    AmazonCrawlerEntities context = DbUtil.getNewContext();
    // The ISBN is pulled from the canonical link, e.g.:
    //<link rel="canonical" href="http://www.amazon.com/Breaking-Dawn-Twilight-Saga-Book/product-reviews/031606792X?pageNumber=113">
    HtmlNode isbnNode = doc.SelectSingleNode(".//link[@rel='canonical']/@href");
    Match isbnMatch = Regex.Match(isbnNode.OuterHtml, @".+/product-reviews/(.+)\?");
    string isbn = "";
    if (isbnMatch == Match.Empty)
    {
        // Fallback: canonical URL without a query string (ends with / or ").
        isbnMatch = Regex.Match(isbnNode.OuterHtml, @".+/product-reviews/(.+)(/|"")");
    }
    isbn = isbnMatch.Groups[1].Value;
    Book b = DbUtil.getBook(context, isbn);
    if (b == null) //this should never happen
    {
        appendLineToLog("[error] ISBN " + isbn + " not found in database");
        return;
    }
    HtmlNodeCollection reviews = doc.SelectNodes(".//div[@class='a-section review']");
    int numProcessed = 0;
    int numNotProcessed = 0;
    int numBadges = 0;
    foreach (HtmlNode r in reviews)
    {
        //id — the review div's HTML id is the review identifier
        string reviewId = r.Id;
        Review reviewToAdd = DbUtil.getOrCreateReview(context, reviewId);
        reviewToAdd.Book = b;
        //starRating — parsed from the star icon's text
        HtmlNode ratingStringNode = r.SelectSingleNode(".//i[starts-with(@class, 'a-icon a-icon-star')]");
        short rating = CrawlUtil.GetStarRatingFromString(ratingStringNode.InnerText);
        reviewToAdd.starRating = rating;
        //reviewTitle
        HtmlNode titleNode = r.SelectSingleNode(".//a[contains(@class, 'review-title')]");
        string title = CrawlUtil.RemoveExcessWhitespaceFromString(titleNode.InnerText.Trim());
        reviewToAdd.reviewTitle = title;
        //reviewerId and reviewerName and badges — all optional (anonymous reviews have no author link)
        HtmlNode reviewerNode = r.SelectSingleNode(".//a[contains(@class, 'author')]");
        if (reviewerNode != null)
        {
            string profileUrl = reviewerNode.GetAttributeValue("href", "");
            string reviewerId = "";
            // Reviewer id sits between "profile/" and "/ref" in the profile URL.
            Match m = Regex.Match(profileUrl, @"profile/(.+)/ref");
            if (m.Groups.Count == 2)
            {
                reviewerId = m.Groups[1].Value;
            }
            string reviewerName = CrawlUtil.RemoveExcessWhitespaceFromString(reviewerNode.InnerText);
            Reviewer reviewer = DbUtil.getOrCreateReviewer(context, reviewerId);
            reviewer.reviewerName = reviewerName;
            reviewer.profileURL = profileUrl;
            reviewToAdd.reviewerId = reviewerId;
            reviewToAdd.reviewerName = reviewerName;
            //badges — saved through a separate context so a duplicate badge
            //doesn't poison the main context's pending review changes
            HtmlNodeCollection badgeNodes = r.SelectNodes(".//a[contains(@class, 'c7y-badge-link')]");
            if (badgeNodes != null)
            {
                AmazonCrawlerEntities tmp = DbUtil.getNewContext();
                foreach (HtmlNode badgeNode in badgeNodes)
                {
                    try
                    {
                        string badgeText = CrawlUtil.RemoveExcessWhitespaceFromString(badgeNode.InnerText);
                        ReviewerBadge newBadge = new ReviewerBadge();
                        newBadge.reviewerId = reviewerId;
                        newBadge.badge = badgeText;
                        newBadge.statsCollectedAt = currentDateTime;
                        tmp.ReviewerBadges.AddObject(newBadge);
                        tmp.SaveChanges();
                        numBadges += 1;
                    }
                    catch (Exception ex)
                    {
                        //ignore :(
                    }
                }
            }
        }
        //publishedDate — text like "on March 2, 2016"; the leading "on" is stripped before parsing.
        // NOTE(review): Replace("on", "") removes EVERY "on" in the string, not just the
        // leading word — verify month/locale strings can never contain "on".
        HtmlNode reviewDateNode = r.SelectSingleNode(".//span[contains(@class, 'review-date')]");
        string dateText = CrawlUtil.RemoveExcessWhitespaceFromString(reviewDateNode.InnerText);
        dateText = dateText.Replace("on", "");
        DateTime publishedDate = DateTime.Parse(dateText);
        reviewToAdd.publishedDate = publishedDate;
        //reviewContent
        HtmlNode reviewTextNode = r.SelectSingleNode(".//span[contains(@class, 'review-text')]");
        string reviewContent = CrawlUtil.RemoveExcessWhitespaceFromString(reviewTextNode.InnerText);
        reviewToAdd.reviewContent = reviewContent;
        // Preview length used only by the (commented-out) debugging output below.
        int chars = reviewContent.Length > 200 ? 200 : reviewContent.Length;
        //numHelpful and numTotal
        //as at 2016-03-02 the 'helpful' text forrmat is "3 people found this helpful." (numTotal doesn't exist any more)
        HtmlNode helpfulNode = r.SelectSingleNode(".//span[contains(@class, 'review-votes')]");
        if (helpfulNode != null)
        {
            if (helpfulNode.InnerText.Contains("of"))
            {
                // Old format: "X of Y people found this helpful".
                int[] helpful = CrawlUtil.GetNumbersFromHelpfulText(helpfulNode.InnerText);
                reviewToAdd.numHelpful = helpful[0];
                reviewToAdd.numTotal = helpful[1];
            }
            else
            {
                // New format: "N people found this helpful." — no total available.
                Match m = Regex.Match(helpfulNode.InnerText, @"(\d+) p.+", RegexOptions.IgnoreCase);
                if (m.Groups.Count == 2)
                {
                    reviewToAdd.numHelpful = Int32.Parse(m.Groups[1].Value);
                }
                reviewToAdd.numTotal = null;
            }
        }
        //numComments — defaults to 0 when the node text isn't a plain number
        HtmlNode numCommentsNode = r.SelectSingleNode(".//span[contains(@class, 'review-comment-total')]");
        int numComments = 0;
        Int32.TryParse(numCommentsNode.InnerText, out numComments);
        reviewToAdd.numComments = numComments;
        //isAmazonVerifiedPurchase — presence of the badge span is the signal; its text is not read
        HtmlNode verifiedPurchaseNode = r.SelectSingleNode(".//span[@class = 'a-size-mini a-color-state a-text-bold']");
        bool isVerifiedPurchase = false;
        if (verifiedPurchaseNode != null)
        {
            isVerifiedPurchase = true;
        }
        reviewToAdd.isAmazonVerifiedPurchase = isVerifiedPurchase;
        //format (e.g. "Format: Kindle Edition")
        HtmlNode formatNode = r.SelectSingleNode(".//div[contains(@class, 'review-data')]/a");
        if (formatNode != null)
        {
            string formatText = CrawlUtil.RemoveExcessWhitespaceFromString(formatNode.InnerText).Replace("Format:", "").Trim();
            reviewToAdd.format = formatText;
        }
        reviewToAdd.statsCollectedAt = currentDateTime;
        reviewToAdd.reviewType = "R";
        ////debugging output
        //appendLineToLog("Processing review " + reviewId);
        //appendLineToLog(string.Format("\trating: {0}", rating));
        //appendLineToLog(string.Format("\ttitle: {0}", title));
        //appendLineToLog(string.Format("\tReviewer: {0} | {1}", reviewToAdd.reviewerId, reviewToAdd.reviewerName));
        ////badges not output
        //appendLineToLog(string.Format("\tPublished: {0}", publishedDate));
        //appendLineToLog(string.Format("\tContent: {0}", reviewContent.Substring(0, chars)));
        //appendLineToLog(string.Format("\tHelpful: {0} of {1}", reviewToAdd.numHelpful, reviewToAdd.numTotal));
        //appendLineToLog(string.Format("\tCommments: {0}", numComments));
        //appendLineToLog(string.Format("\tisVerifiedPurchase: {0}", isVerifiedPurchase));
        //appendLineToLog(string.Format("\tFormat: {0}", reviewToAdd.format));
        // Save per review so one bad review doesn't lose the whole page.
        try
        {
            context.SaveChanges();
            numProcessed += 1;
        }
        catch (Exception ex)
        {
            appendLineToLog(ex.Message);
            appendLineToLog(ex.StackTrace);
            numNotProcessed += 1;
        }
    }
    appendLineToLog(string.Format("{0} reviews saved/updated on page; {1} not saved; {2} badges.", numProcessed, numNotProcessed, numBadges));
}
/// <summary>
/// NCrawler pipeline step for an author 'details (rankings) page': stores one
/// Ranking row per nodeRank div found. When the page has no rank entries, a
/// Ranking with a null rankString is inserted so the author is still marked as
/// processed.
/// </summary>
/// <param name="crawler">The running crawler (unused here beyond the pipeline signature).</param>
/// <param name="propertyBag">Per-page crawl state; ["HtmlDoc"] holds the parsed HTML document.</param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    DateTime currentDateTime = DateTime.Now;
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
    }
    AuthorHtmlCrawler.form.appendLineToLog("Crawling 'details (rankings) page' for " + AuthorHtmlCrawler.currentAuthor.DisplayString + " [ " + propertyBag.ResponseUri.OriginalString + " ]");
    // NOTE(review): lock (this) is a known anti-pattern; a private readonly lock
    // object would be safer. Kept as-is.
    lock (this)
    {
        AmazonCrawlerEntities context = DbUtil.getNewContext();
        Author a = DbUtil.getAuthor(context, AuthorHtmlCrawler.currentAuthor.id);
        string updated = "rankings";
        if (a == null) //this should never happen
        {
            AuthorHtmlCrawler.form.appendLineToLog("[error] author id " + AuthorHtmlCrawler.currentAuthor.id + " not found in database");
            return;
        }
        //get rankings
        var rankingNodes = htmlDoc.DocumentNode.SelectNodes(".//div[@class='nodeRank']");
        if (rankingNodes != null)
        {
            // One Ranking row per nodeRank div; each row is saved individually so
            // one failing insert doesn't lose the rest.
            foreach (var rankNode in rankingNodes)
            {
                try
                {
                    Ranking r = new Ranking();
                    r.authorId = a.id;
                    r.statsCollectedAt = currentDateTime;
                    r.rankString = CrawlUtil.TidyHtmlText(rankNode.InnerText);
                    AuthorHtmlCrawler.form.appendLineToLog(r.rankString);
                    context.AddToRankings(r);
                    context.SaveChanges();
                }
                catch (Exception ex)
                {
                    AuthorHtmlCrawler.form.appendLineToLog("**ERROR** " + ex.Message);
                    if (ex.InnerException != null)
                    {
                        AuthorHtmlCrawler.form.appendLineToLog("\t** " + ex.InnerException.Message);
                    }
                }
            }
        }
        else
        {
            try //add a ranking with null rankstring to mark this author has been processed
            {
                Ranking r = new Ranking();
                r.authorId = a.id;
                context.AddToRankings(r);
                context.SaveChanges();
            }
            catch (Exception ex)
            {
                // Intentionally ignored: failing to write the marker row is non-fatal.
            }
        }
        AuthorHtmlCrawler.form.appendLineToLog("Updated " + updated + " for " + AuthorHtmlCrawler.currentAuthor.DisplayString);
    }
}
/// <summary>
/// NCrawler pipeline step for a book 'review page'. Handles Amazon's "Robot
/// Check" redirect (blacklists the user agent, logs the URL for retry, sleeps),
/// queues the remaining review pages when processing page 1, then parses every
/// review on the page (reviewer, badges, date, content, helpful votes, comments,
/// verified purchase, format) into the database.
/// </summary>
/// <param name="crawler">The running crawler; extra review pages are queued on it and its UserAgent is blacklisted on a robot check.</param>
/// <param name="propertyBag">Per-page crawl state; ["HtmlDoc"] holds the parsed HTML document.</param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    DateTime currentDateTime = DateTime.Now;
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
    }
    //check if the correct page is loaded (sometimes Amazon redirects to a robot check
    if (propertyBag.Title.ToLower() == "robot check")
    {
        ReviewHtmlCrawler.form.appendLineToLog(propertyBag.Title + Environment.NewLine + propertyBag.Text.Substring(0, 100));
        //block the user agent that's currently in use
        ReviewHtmlCrawler.UserAgentTracker[crawler.UserAgent] = true;
        //log the page for retry
        File.AppendAllText("REVIEW-to-retry.txt", propertyBag.ResponseUri.OriginalString + System.Environment.NewLine);
        //sleep between 30 secs and 1 minute
        TimeSpan duration = new TimeSpan(0, 0, RandomNumber.Next(30, 60));
        ReviewHtmlCrawler.form.appendLineToLog(string.Format("Sleeping for {0}:{1} mins and secs before trying next book.", duration.Minutes, duration.Seconds));
        System.Threading.Thread.Sleep(duration);
        return;
    }
    ReviewHtmlCrawler.form.appendLineToLog("Crawling a 'review page' for " + ReviewHtmlCrawler.currentBook.DisplayString);
    ReviewHtmlCrawler.form.appendLineToLog(propertyBag.ResponseUri.ToString());
    AmazonCrawlerEntities context = DbUtil.getNewContext();
    Book b = DbUtil.getBook(context, ReviewHtmlCrawler.currentBook.ISBN);
    if (b == null) //this should never happen
    {
        ReviewHtmlCrawler.form.appendLineToLog("[error] ISBN " + ReviewHtmlCrawler.currentBook.ISBN + " not found in database");
        return;
    }
    //add all other pages of reviews if crawling reviews and on page 1
    //TODO: also get star rating distribution
    if (ReviewHtmlCrawler.GetReviews && isFirstPage(propertyBag))
    {
        ReviewHtmlCrawler.form.appendLineToLog("crawling first page");
        // Fan out: read the last page number from the pager and queue pages 2..N.
        int numPages = CrawlUtil.GetReviewLastPageNumber(htmlDoc.DocumentNode);
        CrawlUtil.AddReviewPagesToCrawl(crawler, propertyBag.ResponseUri.OriginalString, numPages);
        ReviewHtmlCrawler.form.appendLineToLog(string.Format("***************************", numPages));
        ReviewHtmlCrawler.form.appendLineToLog(string.Format("*** {0} pages to crawl ***", numPages));
        ReviewHtmlCrawler.form.appendLineToLog(string.Format("***************************", numPages));
        if (ReviewHtmlCrawler.GetRatingStats)
        {
            #region rating stats
            //b.statsCollectedAt = currentDateTime;
            ////as of 2015-12-11 the rating break-down is expressed in percentages rather than numbers.
            //b.numFiveStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.FIVE);
            //b.numFourStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.FOUR);
            //b.numThreeStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.THREE);
            //b.numTwoStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.TWO);
            //b.numOneStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.ONE);
            //try //save for rating stats
            //{
            //    context.SaveChanges();
            //    updated += "RATING STATS;";
            //}
            //catch (Exception ex)
            //{
            //    //ignore :(
            //    ReviewHtmlCrawler.form.appendLineToLog(ex.Message);
            //}
            #endregion
        }
    }
    if (ReviewHtmlCrawler.GetReviews)
    {
        // NOTE(review): this parse loop duplicates ProcessPage's loop in the retry
        // tool — keep the two in sync if the page markup changes.
        HtmlNodeCollection reviews = htmlDoc.DocumentNode.SelectNodes(".//div[@class='a-section review']");
        int numProcessed = 0;
        int numNotProcessed = 0;
        int numBadges = 0;
        foreach (HtmlNode r in reviews)
        {
            //id — the review div's HTML id is the review identifier
            string reviewId = r.Id;
            Review reviewToAdd = DbUtil.getOrCreateReview(context, reviewId);
            reviewToAdd.Book = b;
            //starRating — parsed from the star icon's text
            HtmlNode ratingStringNode = r.SelectSingleNode(".//i[starts-with(@class, 'a-icon a-icon-star')]");
            short rating = CrawlUtil.GetStarRatingFromString(ratingStringNode.InnerText);
            reviewToAdd.starRating = rating;
            //reviewTitle
            HtmlNode titleNode = r.SelectSingleNode(".//a[contains(@class, 'review-title')]");
            string title = CrawlUtil.RemoveExcessWhitespaceFromString(titleNode.InnerText.Trim());
            reviewToAdd.reviewTitle = title;
            //reviewerId and reviewerName and badges — all optional (anonymous reviews have no author link)
            HtmlNode reviewerNode = r.SelectSingleNode(".//a[contains(@class, 'author')]");
            if (reviewerNode != null)
            {
                string profileUrl = reviewerNode.GetAttributeValue("href", "");
                string reviewerId = "";
                // Reviewer id sits between "profile/" and "/ref" in the profile URL.
                Match m = Regex.Match(profileUrl, @"profile/(.+)/ref");
                if (m.Groups.Count == 2)
                {
                    reviewerId = m.Groups[1].Value;
                }
                string reviewerName = CrawlUtil.RemoveExcessWhitespaceFromString(reviewerNode.InnerText);
                Reviewer reviewer = DbUtil.getOrCreateReviewer(context, reviewerId);
                reviewer.reviewerName = reviewerName;
                reviewer.profileURL = profileUrl;
                reviewToAdd.reviewerId = reviewerId;
                reviewToAdd.reviewerName = reviewerName;
                //badges — saved through a separate context so a duplicate badge
                //doesn't poison the main context's pending review changes
                HtmlNodeCollection badgeNodes = r.SelectNodes(".//a[contains(@class, 'c7y-badge-link')]");
                if (badgeNodes != null)
                {
                    AmazonCrawlerEntities tmp = DbUtil.getNewContext();
                    foreach (HtmlNode badgeNode in badgeNodes)
                    {
                        try
                        {
                            string badgeText = CrawlUtil.RemoveExcessWhitespaceFromString(badgeNode.InnerText);
                            ReviewerBadge newBadge = new ReviewerBadge();
                            newBadge.reviewerId = reviewerId;
                            newBadge.badge = badgeText;
                            newBadge.statsCollectedAt = currentDateTime;
                            tmp.ReviewerBadges.AddObject(newBadge);
                            tmp.SaveChanges();
                            numBadges += 1;
                        }
                        catch (Exception ex)
                        {
                            //ignore :(
                        }
                    }
                }
            }
            //publishedDate — text like "on March 2, 2016"; the leading "on" is stripped before parsing.
            // NOTE(review): Replace("on", "") removes EVERY "on" in the string, not just
            // the leading word — verify month/locale strings can never contain "on".
            HtmlNode reviewDateNode = r.SelectSingleNode(".//span[contains(@class, 'review-date')]");
            string dateText = CrawlUtil.RemoveExcessWhitespaceFromString(reviewDateNode.InnerText);
            dateText = dateText.Replace("on", "");
            DateTime publishedDate = DateTime.Parse(dateText);
            reviewToAdd.publishedDate = publishedDate;
            //reviewContent
            HtmlNode reviewTextNode = r.SelectSingleNode(".//span[contains(@class, 'review-text')]");
            string reviewContent = CrawlUtil.RemoveExcessWhitespaceFromString(reviewTextNode.InnerText);
            reviewToAdd.reviewContent = reviewContent;
            // Preview length used only by the (commented-out) debugging output below.
            int chars = reviewContent.Length > 200 ? 200 : reviewContent.Length;
            //numHelpful and numTotal
            //as at 2016-03-02 the 'helpful' text forrmat is "3 people found this helpful." (numTotal doesn't exist any more)
            HtmlNode helpfulNode = r.SelectSingleNode(".//span[contains(@class, 'review-votes')]");
            if (helpfulNode != null)
            {
                if (helpfulNode.InnerText.Contains("of"))
                {
                    // Old format: "X of Y people found this helpful".
                    int[] helpful = CrawlUtil.GetNumbersFromHelpfulText(helpfulNode.InnerText);
                    reviewToAdd.numHelpful = helpful[0];
                    reviewToAdd.numTotal = helpful[1];
                }
                else
                {
                    // New format: "N people found this helpful." — no total available.
                    Match m = Regex.Match(helpfulNode.InnerText, @"(\d+) p.+", RegexOptions.IgnoreCase);
                    if (m.Groups.Count == 2)
                    {
                        reviewToAdd.numHelpful = Int32.Parse(m.Groups[1].Value);
                    }
                    reviewToAdd.numTotal = null;
                }
            }
            //numComments — defaults to 0 when the node text isn't a plain number
            HtmlNode numCommentsNode = r.SelectSingleNode(".//span[contains(@class, 'review-comment-total')]");
            int numComments = 0;
            Int32.TryParse(numCommentsNode.InnerText, out numComments);
            reviewToAdd.numComments = numComments;
            //isAmazonVerifiedPurchase — presence of the badge span is the signal; its text is not read
            HtmlNode verifiedPurchaseNode = r.SelectSingleNode(".//span[@class = 'a-size-mini a-color-state a-text-bold']");
            bool isVerifiedPurchase = false;
            if (verifiedPurchaseNode != null)
            {
                isVerifiedPurchase = true;
            }
            reviewToAdd.isAmazonVerifiedPurchase = isVerifiedPurchase;
            //format (e.g. "Format: Kindle Edition")
            HtmlNode formatNode = r.SelectSingleNode(".//div[contains(@class, 'review-data')]/a");
            if (formatNode != null)
            {
                string formatText = CrawlUtil.RemoveExcessWhitespaceFromString(formatNode.InnerText).Replace("Format:", "").Trim();
                reviewToAdd.format = formatText;
            }
            reviewToAdd.statsCollectedAt = currentDateTime;
            reviewToAdd.reviewType = "R";
            //debugging output
            //ReviewHtmlCrawler.form.appendLineToLog("Processing review " + reviewId);
            //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\trating: {0}", rating));
            //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\ttitle: {0}", title));
            //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\tReviewer: {0} | {1}", reviewToAdd.reviewerId, reviewToAdd.reviewerName));
            ////badges not output
            //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\tPublished: {0}", publishedDate));
            //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\tContent: {0}", reviewContent.Substring(0, chars)));
            //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\tHelpful: {0} of {1}", reviewToAdd.numHelpful, reviewToAdd.numTotal));
            //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\tCommments: {0}", numComments));
            //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\tisVerifiedPurchase: {0}", isVerifiedPurchase));
            //ReviewHtmlCrawler.form.appendLineToLog(string.Format("\tFormat: {0}", reviewToAdd.format));
            // Save per review so one bad review doesn't lose the whole page.
            try
            {
                context.SaveChanges();
                numProcessed += 1;
            }
            catch (Exception ex)
            {
                ReviewHtmlCrawler.form.appendLineToLog(ex.Message);
                ReviewHtmlCrawler.form.appendLineToLog(ex.StackTrace);
                numNotProcessed += 1;
            }
        }
        ReviewHtmlCrawler.form.appendLineToLog(string.Format("{0} reviews saved/updated on page; {1} not saved; {2} badges.", numProcessed, numNotProcessed, numBadges));
    }
}