/*
 * [numFiveStarRatings][numFourStarRatings][numThreeStarRatings][numTwoStarRatings][numOneStarRatings]
 *
 * Example of one histogram row as it appears in the page markup:
 *
 * <tr>
 *   <td align="left" style="padding-right:0.5em;padding-bottom:1px;white-space:nowrap;font-size:10px;">
 *     <a href="http://www.amazon.com/Already-Dead-Joe-Pitt-Novel/product-reviews/034547824X/ref=cm_cr_pr_hist_5?ie=UTF8&filterBy=addFiveStar&showViewpoints=0" style="font-family:Verdana,Arial,Helvetica,Sans-serif;">
 *       5 star
 *     </a>:
 *   </td>
 *   <td style="min-width:60; background-color: #eeeecc" width="60" align="left" class="tiny" title="49%">
 *     <...>
 *   </td>
 *   <td align="right" style="font-family:Verdana,Arial,Helvetica,Sans-serif;;font-size:10px;">
 *     (82)
 *   </td>
 * </tr>
 */

/// <summary>
/// Reads the number of ratings for one star level from the ratings histogram of a page.
/// </summary>
/// <param name="htmlDoc">Parsed page to search.</param>
/// <param name="b">Not used by this method; kept so existing callers keep compiling.</param>
/// <param name="starNum">
/// Star level as it is spelled in the histogram link, i.e. the text between
/// "filterBy=add" and "Star" in the link's href (e.g. "Five").
/// </param>
/// <returns>
/// The extracted rating count; 0 when the histogram row is not present in the page;
/// -1 when the row was found but none of the extraction strategies succeeded
/// (the extract helpers are treated as returning -1 on failure).
/// </returns>
internal static int GetStarRating(HtmlDocument htmlDoc, Book b, string starNum)
{
    int rating = 0;

    //the histogram link for this star level; the table row is two levels above it
    string starLinkXPath = "a[contains(@href,'filterBy=add" + starNum + "Star')]";

    var nStarNode = htmlDoc.DocumentNode.SelectSingleNode("//" + starLinkXPath + "/../..");
    if (nStarNode == null)
    {
        return rating;
    }

    try
    {
        //this method works for a review page
        rating = CrawlUtil.ExtractRatingFromStarString(CrawlUtil.TidyHtmlText(nStarNode.InnerText));

        //try another method (for detail page)
        if (rating == -1)
        {
            //".//" keeps the lookup inside the row that was found; a leading "//" would
            //restart the search at the document root and ignore the row entirely
            var starLink = nStarNode.SelectSingleNode(".//" + starLinkXPath);
            if (starLink != null)
            {
                rating = CrawlUtil.ExtractRatingFromStarStringFromDetailPage(CrawlUtil.TidyHtmlText(starLink.InnerText));
            }
        }

        //if it failed, try another method (for detail page)
        if (rating == -1)
        {
            rating = CrawlUtil.ExtractRatingFromStarStringFromDetailPage(CrawlUtil.TidyHtmlText(nStarNode.InnerText));
        }
    }
    catch (Exception ex)
    {
        //best effort: a page that cannot be parsed must not abort the crawl, but the failure
        //should still be visible somewhere. Whatever value rating held when the exception was
        //raised is returned unchanged.
        System.Diagnostics.Trace.TraceWarning("[{0} star rating] not found ({1})", starNum, ex.Message);
    }

    return rating;
}
/// <summary>
/// Pipeline step for a book's details page: stores the average customer rating and the
/// review page URL found in the page on the current book's database record.
/// </summary>
/// <remarks>
/// When the response is a "robot check" page instead of the details page, the user agent in use
/// is flagged in <c>BookHtmlCrawler.UserAgentTracker</c>, the thread sleeps for a random
/// 30-180 seconds and nothing is stored.
/// </remarks>
/// <param name="crawler">Crawler that fetched the page; its <c>UserAgent</c> is flagged on a robot check.</param>
/// <param name="propertyBag">Crawl result; the parsed document is read from its "HtmlDoc" entry.</param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    //using c.MaximumCrawlDepth = 1 means only the details page will be processed (no other links are crawled as they're depth 2)
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
    }

    BookHtmlCrawler.form.appendLineToLog("Crawling 'details page' for " + BookHtmlCrawler.currentBook.DisplayString);
    BookHtmlCrawler.form.appendLineToLog(propertyBag.ResponseUri.OriginalString);

    lock (this)
    {
        //check if the correct page is loaded (sometimes Amazon redirects to a robot check).
        //string.Equals copes with a missing title and does not depend on the current culture.
        if (string.Equals(propertyBag.Title, "robot check", StringComparison.OrdinalIgnoreCase))
        {
            //log at most the first 100 characters; the page may well be shorter than that
            string pageText = propertyBag.Text ?? "";
            string excerpt = pageText.Substring(0, Math.Min(100, pageText.Length));
            BookHtmlCrawler.form.appendLineToLog(propertyBag.Title + Environment.NewLine + excerpt);

            //block the user agent that's currently in use
            BookHtmlCrawler.UserAgentTracker[crawler.UserAgent] = true;

            //sleep between 30 secs and 3 minutes
            TimeSpan duration = new TimeSpan(0, 0, RandomNumber.Next(30, 3 * 60));
            BookHtmlCrawler.form.appendLineToLog(string.Format("Sleeping for {0}:{1} mins and secs before trying next book.", duration.Minutes, duration.Seconds));
            System.Threading.Thread.Sleep(duration);
            return;
        }

        string updated = "";

        using (AmazonCrawlerEntities context = DbUtil.getNewContext())
        {
            Book b = DbUtil.getBook(context, BookHtmlCrawler.currentBook.ISBN);
            if (b == null) //this should never happen
            {
                BookHtmlCrawler.form.appendLineToLog("[error] ISBN " + BookHtmlCrawler.currentBook.ISBN + " not found in database");
                return;
            }

            if (BookHtmlCrawler.GetDetailsAndAuthor)
            {
                //author, language, reading level, format, sales ranks not stored.

                #region average rating
                var averageRatingNode = htmlDoc.DocumentNode.SelectSingleNode(@"//div[@id='avgRating']/span/a");
                if (averageRatingNode != null)
                {
                    string ratingText = averageRatingNode.InnerText;
                    decimal rating = CrawlUtil.ExtractRatingFromSummaryString(CrawlUtil.TidyHtmlText(ratingText));
                    b.avgCustomerReview = rating;
                    BookHtmlCrawler.form.appendLineToLog("\tavg: " + rating);
                }
                #endregion

                #region review page URL
                //the stored URL is the link target cut off just before its "/ref=" suffix
                string url = null;
                var reviewUrlNode = htmlDoc.DocumentNode.SelectSingleNode(@"//a[@class='a-link-emphasis a-nowrap']/@href");
                if (reviewUrlNode != null)
                {
                    string workingUrl = reviewUrlNode.GetAttributeValue("href", null);
                    if (!string.IsNullOrEmpty(workingUrl))
                    {
                        int refIndex = workingUrl.IndexOf("/ref=", StringComparison.Ordinal);
                        if (refIndex > 0)
                        {
                            url = workingUrl.Substring(0, refIndex);
                        }
                    }
                }

                try //save for rating and reviewURL
                {
                    BookHtmlCrawler.form.appendLineToLog("\treview URL added: " + url);
                    b.reviewPageURL = url ?? "-"; //"-" is stored when no review URL could be extracted
                    context.SaveChanges();
                }
                catch (Exception ex)
                {
                    //ignore :(
                    BookHtmlCrawler.form.appendLineToLog(ex.Message);
                }
                #endregion

                //2015-12-01: disabled this feature. It looks like the ratings are now in %s rather than numbers.
                //(re-enabling it requires reintroducing a currentDateTime local for statsCollectedAt)
                #region rating stats
                //TEMP 2014-06-12: get star rating from details page
                //b.statsCollectedAt = currentDateTime;
                //b.numFiveStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.FIVE);
                //b.numFourStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.FOUR);
                //b.numThreeStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.THREE);
                //b.numTwoStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.TWO);
                //b.numOneStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.ONE);

                //BookHtmlCrawler.form.appendLineToLog("\t" + CrawlUtil.FIVE + " star: " + b.numFiveStarRatings);
                //BookHtmlCrawler.form.appendLineToLog("\t" + CrawlUtil.FOUR + " star: " + b.numFourStarRatings);
                //BookHtmlCrawler.form.appendLineToLog("\t" + CrawlUtil.THREE + " star: " + b.numThreeStarRatings);
                //BookHtmlCrawler.form.appendLineToLog("\t" + CrawlUtil.TWO + " star: " + b.numTwoStarRatings);
                //BookHtmlCrawler.form.appendLineToLog("\t" + CrawlUtil.ONE + " star: " + b.numOneStarRatings);

                //try //save for rating stats
                //{
                //    context.SaveChanges();
                //    updated += "RATING STATS;";
                //}
                //catch (Exception ex)
                //{
                //    //ignore :(
                //    ReviewHtmlCrawler.form.appendLineToLog(ex.Message);
                //}
                #endregion

                updated += "DETAILS (avg rating, reviewURL)";
            }
        }

        BookHtmlCrawler.form.appendLineToLog("Updated " + updated + " for " + BookHtmlCrawler.currentBook.DisplayString);
    }
}
/// <summary>
/// Pipeline step for an author's details (rankings) page: stores one <c>Ranking</c> row per
/// <c>div class='nodeRank'</c> element found in the page for the current author.
/// </summary>
/// <remarks>
/// When the page contains no ranking elements, a single <c>Ranking</c> row with no rank string
/// is stored instead, to mark the author as processed.
/// </remarks>
/// <param name="crawler">Crawler that fetched the page; not used by this step.</param>
/// <param name="propertyBag">Crawl result; the parsed document is read from its "HtmlDoc" entry.</param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    DateTime currentDateTime = DateTime.Now;

    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
    }

    AuthorHtmlCrawler.form.appendLineToLog("Crawling 'details (rankings) page' for " + AuthorHtmlCrawler.currentAuthor.DisplayString + " [ " + propertyBag.ResponseUri.OriginalString + " ]");

    lock (this)
    {
        const string updated = "rankings";

        //dispose the context on every exit path, including the early return below
        using (AmazonCrawlerEntities context = DbUtil.getNewContext())
        {
            Author a = DbUtil.getAuthor(context, AuthorHtmlCrawler.currentAuthor.id);
            if (a == null) //this should never happen
            {
                AuthorHtmlCrawler.form.appendLineToLog("[error] author id " + AuthorHtmlCrawler.currentAuthor.id + " not found in database");
                return;
            }

            //get rankings
            var rankingNodes = htmlDoc.DocumentNode.SelectNodes(".//div[@class='nodeRank']");
            if (rankingNodes != null)
            {
                foreach (var rankNode in rankingNodes)
                {
                    try
                    {
                        Ranking r = new Ranking();
                        r.authorId = a.id;
                        r.statsCollectedAt = currentDateTime;
                        r.rankString = CrawlUtil.TidyHtmlText(rankNode.InnerText);
                        AuthorHtmlCrawler.form.appendLineToLog(r.rankString);

                        //saved one row at a time so a bad row does not discard the rows before it.
                        //NOTE(review): a row whose save failed presumably stays pending in the context
                        //and would be retried by the next SaveChanges - confirm against the context type.
                        context.AddToRankings(r);
                        context.SaveChanges();
                    }
                    catch (Exception ex)
                    {
                        LogRankingSaveError(ex);
                    }
                }
            }
            else
            {
                try //add a ranking with null rankstring to mark this author has been processed
                {
                    //NOTE(review): statsCollectedAt is left unset on the marker row, unlike the rows above - confirm this is intended
                    Ranking r = new Ranking();
                    r.authorId = a.id;
                    context.AddToRankings(r);
                    context.SaveChanges();
                }
                catch (Exception ex)
                {
                    //still best effort, but no longer silent: a failed marker row means the
                    //author is not recorded as processed
                    LogRankingSaveError(ex);
                }
            }
        }

        AuthorHtmlCrawler.form.appendLineToLog("Updated " + updated + " for " + AuthorHtmlCrawler.currentAuthor.DisplayString);
    }
}

//Writes an exception message, and its inner exception's message when there is one, to the crawler log.
private static void LogRankingSaveError(Exception ex)
{
    AuthorHtmlCrawler.form.appendLineToLog("**ERROR** " + ex.Message);
    if (ex.InnerException != null)
    {
        AuthorHtmlCrawler.form.appendLineToLog("\t** " + ex.InnerException.Message);
    }
}