public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
    }

    HtmlNode doc = htmlDoc.DocumentNode;

    string temp = propertyBag.ResponseUri.OriginalString; //leftover debug value (unused)
    string temp2 = CrawlReviewsOnUserProfile.baseUri.Substring(0, 54 + User.userIdString.Length); //leftover debug value (unused)

    //on page 1, add other pages for user
    if (propertyBag.ResponseUri.OriginalString == CrawlReviewsOnUserProfile.baseUri)
    {
        var node = doc.SelectSingleNode(".//div[@id='reviewPagination']");
        if (node != null)
        {
            try
            {
                var x = node.LastChild.PreviousSibling.PreviousSibling;
                int maxPage = Int32.Parse(x.InnerText.Trim());

                string uri;
                for (int i = 2; i <= maxPage; i++)
                {
                    uri = "http://www.goodreads.com/review/list/" + User.userIdString
                        + "?format=html&page=" + i + "&per_page=100&shelf=read";
                    crawler.AddStep(new Uri(uri), 0);

                    CrawlReviewsOnUserProfile.form.appendLineToLog(uri);
                }
            }
            catch (Exception)
            {
            }
        }
    }

    lock (this)
    {
        GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();

        //SelectNodes returns null (not an empty collection) when nothing matches
        var reviewNodes = doc.SelectNodes(".//tr[@class='bookalike review']");
        if (reviewNodes == null)
        {
            return; //no reviews on this page
        }

        foreach (var reviewNode in reviewNodes)
        {
            CrawlReviewsOnUserProfile.count++;

            string reviewIdString = reviewNode.GetAttributeValue("id", "");
            if (reviewIdString == "")
            {
                return;
            }

            int reviewId = Int32.Parse(reviewIdString.Split('_').GetValue(1).ToString());

            Review review = CrawlUtil.getReview(context, reviewId);

            //create and process the REVIEW if it doesn't already exist
            if (review == null)
            {
                HtmlNode node;

                review = new Review();
                review.id = reviewId;

                //REVIEW.rating
                node = reviewNode.SelectSingleNode(".//td[@class='field rating']//img");
                if (node != null)
                {
                    string ratingString = node.GetAttributeValue("alt", "0");
                    short rating = short.Parse(ratingString.Substring(0, 1));
                    review.starRating = rating;
                }

                //REVIEW.publishdate
                node = reviewNode.SelectSingleNode(".//td[@class='field date_added']//span");
                if (node != null)
                {
                    DateTime date;
                    DateTime.TryParse(node.InnerText, out date);
                    review.publishDate = date;
                }

                //USER
                review.userId = User.id;
                review.userIdString = User.userIdString;

                //BOOK
                node = reviewNode.SelectSingleNode(".//td[@class='field title']//a");
                string bookUrl = node.GetAttributeValue("href", "");
                int bookId = CrawlUtil.extractIdNumberFromUrl(bookUrl); //if bookUrl is null then bookId gets set to 0

                Book book = CrawlUtil.getBook(context, bookId);
                if (book == null)
                {
                    book = new Book();
                    book.id = bookId;

                    string title = node.GetAttributeValue("title", "");
                    book.title = title;

                    node = reviewNode.SelectSingleNode(".//td[@class='field isbn']//div");
                    if (node != null)
                    {
                        book.isbn = node.InnerText.Trim();
                    }

                    //AUTHOR
                    node = reviewNode.SelectSingleNode(".//td[@class='field author']//a");
                    if (node != null)
                    {
                        string authorUrl = node.GetAttributeValue("href", "");
                        int authorId = CrawlUtil.extractIdNumberFromUrl(authorUrl); //if authorUrl is null then authorId gets set to 0

                        Author author = CrawlUtil.getAuthor(context, authorId);
                        if (author == null)
                        {
                            author = new Author();
                            author.id = authorId;
                            author.name = CrawlUtil.formatAuthorName(node.InnerText.Trim());

                            book.Author = author;
                        }
                    }
                }
                review.Book = book;

                context.SaveChanges();
            }
        }
        CrawlReviewsOnUserProfile.form.appendLineToLog(User.userIdString + ":: " + CrawlReviewsOnUserProfile.count + " reviews crawled");
    }
}
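This step delegates id parsing to CrawlUtil.extractIdNumberFromUrl, whose implementation is not part of this listing. Below is a minimal sketch of what it is assumed to do, based on the "gets set to 0" comments above and Goodreads hrefs of the form "/book/show/2657.To_Kill_a_Mockingbird" or "/author/show/1825.Harper_Lee"; the real helper may differ.

//Hypothetical sketch, not the project's actual CrawlUtil implementation.
//Assumed behaviour: take the last URL path segment and return its leading digits,
//or 0 when the URL is null/empty or contains no id.
public static int extractIdNumberFromUrl(string url)
{
    if (String.IsNullOrEmpty(url))
    {
        return 0;
    }

    //last path segment, e.g. "2657.To_Kill_a_Mockingbird"
    string trimmed = url.TrimEnd('/');
    string lastSegment = trimmed.Substring(trimmed.LastIndexOf('/') + 1);

    //keep only the leading digits
    var match = System.Text.RegularExpressions.Regex.Match(lastSegment, @"^\d+");
    int id;
    if (match.Success && Int32.TryParse(match.Value, out id))
    {
        return id;
    }
    return 0;
}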
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
    }

    //only process the review page
    if (propertyBag.ResponseUri.OriginalString != CrawlReviews.baseUri)
    {
        return;
    }

    lock (this)
    {
        GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();

        Review review = CrawlUtil.getReview(context, ReviewId);
        if (review == null)
        {
            return; //this should never happen
        }

        //fields to fill in:
        //userId          int           Checked
        //userIdString    varchar(256)  Checked
        //reviewContent   varchar(MAX)  Checked
        //starRating      smallint      Checked
        //publishDate     date          Checked
        //recommendedFor  varchar(MAX)  Checked
        //numComments     int           Checked

        HtmlNode doc = htmlDoc.DocumentNode;

        int userId = -1;
        string userIdString = null;
        string userName = null;
        string reviewContent = null;
        short starRating = -1;
        DateTime publishDate;
        //string recommendedFor = null;
        int numComments = -1;

        //<a href="/user/show/52663-mer" class="userReview" itemprop="author">Mer</a>
        var reviewAuthorNode = doc.SelectSingleNode("//a[@itemprop='author']");
        if (reviewAuthorNode != null)
        {
            userIdString = reviewAuthorNode.GetAttributeValue("href", null);
            userIdString = CrawlUtil.extractUserIdStringFromUrl(userIdString); //null if error
            userId = CrawlUtil.extractUserIdFromString(userIdString); //-1 or 0 if error
            userName = reviewAuthorNode.InnerText.Trim(); //empty if error

            if (userIdString != null && userId > 0 && !String.IsNullOrEmpty(userName))
            {
                var user = CrawlUtil.createOrGetUser(context, userId, userIdString, userName);
                review.User = user;
                review.userIdString = userIdString;
            }
        }

        var reviewContentNode = doc.SelectSingleNode("//div[@itemprop='reviewBody']");
        if (reviewContentNode != null)
        {
            review.reviewContent = reviewContentNode.InnerText.Trim();
        }

        //<span class="value-title" title="5"></span>
        var starRatingNode = doc.SelectSingleNode("//div[@class='rating']/span[@class='value-title']");
        if (starRatingNode != null)
        {
            short.TryParse(starRatingNode.GetAttributeValue("title", ""), out starRating);
            review.starRating = starRating;
        }

        //<span class="value-title" title="2007-04-28"></span>
        var publishDateNode = doc.SelectSingleNode("//div[@class='right dtreviewed greyText smallText']/span[@class='value-title']");
        if (publishDateNode != null)
        {
            DateTime.TryParse(publishDateNode.GetAttributeValue("title", ""), out publishDate);
            review.publishDate = publishDate;
        }

        /*
         * <div>
         *   <span class="">Recommended for:</span>
         *   environmentalists, nurturers, parents and children who want to discuss empathy and reciprocity
         * </div>
         */
        var recommendedForNode = doc.SelectSingleNode("//span[text()='Recommended for:']");
        if (recommendedForNode != null)
        {
            review.recommendedFor = recommendedForNode.ParentNode.LastChild.InnerText.Trim();
        }

        //var numCommentsNode = doc.SelectSingleNode("");
        //if (numCommentsNode != null)
        //{
        //    /*
        //     * <h2 class="brownBackground">
        //     *   <div class="extraHeader">
        //     *     <a href="#comment_form" rel="nofollow">Post a comment »</a>
        //     *   </div>Comments
        //     *   <span class="smallText">
        //     *     (showing 1-5 of 5)
        //     *   </span>
        //     *   ...
        //     * </h2>
        //     */
        //}

        context.SaveChanges();

        CrawlReviews.form.appendLineToLog("Added review " + review.id + " by user " + review.User.name);
    }
}
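The user lookup above relies on two CrawlUtil helpers that are not shown in this listing. The sketch below reflects what the inline comments and the sample href ("/user/show/52663-mer") suggest they do; the actual implementations may differ.

//Hypothetical sketches, not the project's actual CrawlUtil implementations.
//Assumed: extractUserIdStringFromUrl reduces "/user/show/52663-mer" to "52663-mer"
//(null on error); extractUserIdFromString pulls 52663 out of that string
//(-1 or 0 on error), matching the comments in the method above.
public static string extractUserIdStringFromUrl(string url)
{
    if (String.IsNullOrEmpty(url))
    {
        return null;
    }
    int index = url.TrimEnd('/').LastIndexOf('/');
    if (index < 0 || index == url.Length - 1)
    {
        return null;
    }
    return url.Substring(index + 1); //e.g. "52663-mer"
}

public static int extractUserIdFromString(string userIdString)
{
    if (String.IsNullOrEmpty(userIdString))
    {
        return -1;
    }
    var match = System.Text.RegularExpressions.Regex.Match(userIdString, @"^\d+");
    int id;
    if (match.Success && Int32.TryParse(match.Value, out id))
    {
        return id; //numeric part of the id string
    }
    return 0;
}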
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
    }

    HtmlNode doc = htmlDoc.DocumentNode;

    string temp = propertyBag.ResponseUri.OriginalString; //leftover debug value (unused)
    string temp2 = CrawlReviewsOnUserProfile_Updated.baseUri.Substring(0, 54 + User.userIdString.Length); //leftover debug value (unused)

    //on page 1, add other pages for user
    if (propertyBag.ResponseUri.OriginalString == CrawlReviewsOnUserProfile_Updated.baseUri)
    {
        var node = doc.SelectSingleNode(".//div[@id='reviewPagination']");
        if (node != null)
        {
            HtmlNode maxPageNode;
            int maxPage = 0;
            try
            {
                maxPageNode = node.LastChild.PreviousSibling.PreviousSibling;
                maxPage = Int32.Parse(maxPageNode.InnerText.Trim());
            }
            catch (Exception)
            {
                CrawlReviewsOnUserProfile_Updated.form.appendLineToLog("Error getting maxPage on " + propertyBag.ResponseUri.OriginalString);
                return;
            }

            //get new reviews since last crawl?
            int pagesToCrawl = 0;
            if (CrawlReviewsOnUserProfile_Updated.getReviewsSinceLastCrawl)
            {
                pagesToCrawl = maxPage - (User.Reviews.Count / 20); //int division results in truncation
                if (pagesToCrawl < 1)
                {
                    return;
                }

                /**** TEMP to get pages 30 and above (for users with more than 600 reviews (after getting up to 600 only on a previous run)) ****/
                //for (int i = 30; i <= maxPage; i++)
                //{
                //    String s = "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + i + "&print=true&shelf=read&sort=date_added&view=reviews";
                //    crawler.AddStep(new Uri(s), 0);
                //    CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(s);
                //}
                //return;

                /*** Old logic pre 2015 11 30 ***
                 * int startPage = (User.Reviews.Count / 20) + 1;
                 * string uri;
                 * for (int i = startPage; i <= maxPage; i++)
                 * {
                 *     uri = "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + i + "&print=true&shelf=read&sort=date_added&view=reviews";
                 *     crawler.AddStep(new Uri(uri), 0);
                 *
                 *     CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(uri);
                 * }
                 * return;
                 *
                 *************/
            }
            else //crawl every page
            {
                pagesToCrawl = maxPage;
            }

            string uri;
            for (int i = 2; i <= pagesToCrawl; i++)
            {
                //http://www.goodreads.com/review/list/1-otis-chandler?page=3&print=true&shelf=read
                uri = "http://www.goodreads.com/review/list/" + User.userIdString + "?page=" + i + "&print=true&shelf=read";
                crawler.AddStep(new Uri(uri), 0);

                CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(uri);
            }

            //continue with the crawl on page 1 unless endPage is 0 (i.e. no pages need to be crawled)

            //2015-11-30: getting the X latest pages is now redundant since reviews are always sorted by date added ascending.
            //feature removed for 2015 update 5 crawl
            //get reviews from specified pages; or latest X pages
            //get user's latest X pages of reviews if user has more than (maxPage * 20) reviews; X determined by getHowManyLatestPages
            //if (maxPage > CrawlReviewsOnUserProfile_Updated.maxPage)
            //{
            //    if (CrawlReviewsOnUserProfile_Updated.getHowManyLatestPages > 0)
            //    {
            //        int numLatestPages = Math.Min(CrawlReviewsOnUserProfile_Updated.getHowManyLatestPages, maxPage - CrawlReviewsOnUserProfile_Updated.maxPage);
            //        for (int i = 0; i < numLatestPages; i++)
            //        {
            //            string uriLatest = "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + (maxPage - i) + "&print=true&shelf=read&sort=date_added&view=reviews";
            //            crawler.AddStep(new Uri(uriLatest), 0);
            //            CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(uriLatest);
            //        }
            //    }
            //    maxPage = CrawlReviewsOnUserProfile_Updated.maxPage;
            //}

            //string u;
            //for (int i = CrawlReviewsOnUserProfile_Updated.minPage; i <= maxPage; i++)
            //{
            //    u = "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + i + "&print=true&shelf=read&sort=date_added&view=reviews";
            //    crawler.AddStep(new Uri(u), 0);
            //    CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(u);
            //}

            //if we don't want to include page 1 then don't crawl after adding other pages to crawl
            //if (CrawlReviewsOnUserProfile_Updated.minPage > 1)
            //{
            //    return;
            //}
        }
    }

    lock (this)
    {
        GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();

        //SelectNodes returns null (not an empty collection) when nothing matches
        var reviewNodes = doc.SelectNodes(".//tr[@class='bookalike review']");
        if (reviewNodes == null)
        {
            return; //no reviews on this page
        }

        foreach (var reviewNode in reviewNodes)
        {
            string reviewIdString = reviewNode.GetAttributeValue("id", "");
            if (reviewIdString == "")
            {
                return;
            }

            int reviewId = Int32.Parse(reviewIdString.Split('_').GetValue(1).ToString());

            //Review review = CrawlUtil.createOrGetReview(context, reviewId);
            Review review = CrawlUtil.getReview(context, reviewId);
            if (review == null) //review is new
            {
                review = new Review();
                review.id = reviewId;
                context.Reviews.AddObject(review);
            }
            else //review already exists
            {
                continue;
            }

            HtmlNode node;

            //REVIEW.rating
            /* <td class="field rating">
             *   <label>Reb's rating</label>
             *   <div class="value">
             *     <a class=" staticStars stars_4" title="really liked it">4 of 5 stars</a>
             *   </div>
             * </td> */
            node = reviewNode.SelectSingleNode(".//td[@class='field rating']//a");
            if (node != null)
            {
                string ratingClassString = node.GetAttributeValue("class", "0");
                short rating = CrawlUtil.getRatingFromClassString(ratingClassString);
                review.starRating = rating;
            }

            //REVIEW.publishdate
            node = reviewNode.SelectSingleNode(".//td[@class='field date_added']//span");
            if (node != null)
            {
                DateTime date;
                DateTime.TryParse(node.InnerText, out date);
                review.publishDate = date;
            }

            //USER
            review.userId = User.id;
            review.userIdString = User.userIdString;

            //BOOK
            node = reviewNode.SelectSingleNode(".//td[@class='field title']//a");
            string bookUrl = node.GetAttributeValue("href", "");
            int bookId = CrawlUtil.extractIdNumberFromUrl(bookUrl); //if bookUrl is null then bookId gets set to 0

            Book book = CrawlUtil.getBook(context, bookId);
            if (book == null)
            {
                book = new Book();
                book.id = bookId;

                string title = node.GetAttributeValue("title", "");
                book.title = title;

                node = reviewNode.SelectSingleNode(".//td[@class='field isbn']//div");
                if (node != null)
                {
                    book.isbn = node.InnerText.Trim();
                }

                //AUTHOR
                node = reviewNode.SelectSingleNode(".//td[@class='field author']//a");
                if (node != null)
                {
                    string authorUrl = node.GetAttributeValue("href", "");
                    int authorId = CrawlUtil.extractIdNumberFromUrl(authorUrl); //if authorUrl is null then authorId gets set to 0

                    Author author = CrawlUtil.getAuthor(context, authorId);
                    if (author == null)
                    {
                        author = new Author();
                        author.id = authorId;
                        author.name = CrawlUtil.formatAuthorName(node.InnerText.Trim());

                        book.Author = author;
                    }
                }
            }
            review.Book = book;

            context.SaveChanges();
            CrawlReviewsOnUserProfile_Updated.count++;
        }
        CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(User.userIdString + ":: " + CrawlReviewsOnUserProfile_Updated.count + " reviews crawled");
    }
}
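Unlike the original profile crawler, this updated step reads the star rating from the anchor's class attribute via CrawlUtil.getRatingFromClassString, which is not shown in this listing. A minimal sketch of the assumed behaviour, based on the "staticStars stars_4" snippet quoted in the rating block above:

//Hypothetical sketch, not the project's actual CrawlUtil implementation.
//Assumed behaviour: read N from a "stars_N" token in a class string such as
//" staticStars stars_4", returning 0 when no such token is present.
public static short getRatingFromClassString(string classString)
{
    if (String.IsNullOrEmpty(classString))
    {
        return 0;
    }
    var match = System.Text.RegularExpressions.Regex.Match(classString, @"stars_(\d)");
    short rating;
    if (match.Success && short.TryParse(match.Groups[1].Value, out rating))
    {
        return rating; //e.g. 4 for " staticStars stars_4"
    }
    return 0;
}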