public static void Run(frmMain parentForm, Book book)
{
    form = parentForm;
    IsolatedStorageModule.Setup(false);
    currentBook = book;

    existingReviewIds = CrawlUtil.getNewContext().Reviews
        .Where(r => r.bookId == currentBook.id)
        .Select(r => r.id)
        .ToList();

    baseUri = "http://www.goodreads.com/api/reviews_widget_iframe?did=DEVELOPER_ID&format=html&isbn="
        + book.isbn + "&links=660&min_rating=&review_back=fff&stars=000&text=000";

    c = new Crawler(new Uri(baseUri),
        new HtmlDocumentProcessor(),   // process HTML
        new ReviewIFrameDumperStep()); // custom step to process the review iframe pages

    // 2012-09-03: changed from 2 to 1 in the hope that it fixes the unknown, (seemingly) random crashes.
    c.MaximumThreadCount = 1;
    c.MaximumCrawlDepth = 1;
    c.ExcludeFilter = CrawlUtil.ExtensionsToSkip;
    c.AdhereToRobotRules = false;

    // Begin crawl
    c.Crawl();
}
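// For context: a step like ReviewIFrameDumperStep plugs into NCrawler's pipeline by
// implementing its IPipelineStep interface, whose single Process(Crawler, PropertyBag)
// method is the signature seen in every step below. A minimal skeleton (a sketch; the
// namespaces assume NCrawler 3.x and may need adjusting):
using HtmlAgilityPack;
using NCrawler;
using NCrawler.Interfaces;

public class ReviewIFrameDumperStep : IPipelineStep
{
    public void Process(Crawler crawler, PropertyBag propertyBag)
    {
        // "HtmlDoc" is populated by the upstream HtmlDocumentProcessor step.
        HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
        if (htmlDoc == null)
        {
            return; // HTTP error etc.
        }
        // ...parse and persist, as in the Process implementations below...
    }
}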
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; // happens on HTTP errors etc.; we don't bother retrying or anything like that :(
    }

    HtmlNode doc = htmlDoc.DocumentNode;

    lock (this)
    {
        // There are 30 authors per page, so the easier method is to read the total
        // count from the page title instead of paging through.
        if (propertyBag.ResponseUri.OriginalString == CrawlFavouriteAuthors.baseUri)
        {
            var node = doc.SelectSingleNode("//title");
            if (node != null)
            {
                string s = node.InnerText.Trim();
                User.numFavouriteAuthors = Convert.ToInt32(
                    CrawlUtil.extractNumberFromString(s.Substring(s.IndexOf(" authors"))));
            }
            else
            {
                User.numFavouriteAuthors = 0;
            }
        }

        CrawlFavouriteAuthors.form.appendLineToLog(
            User.userIdString + ":: updated to have " + User.numFavouriteAuthors + " favourite authors.");
    }
}
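// CrawlUtil.extractNumberFromString is not shown in this listing. Judging from its call
// sites (it returns a decimal, -1 signals "no number found", and it must cope with
// thousands separators such as "1,505 friends"), a plausible sketch:
using System.Globalization;
using System.Text.RegularExpressions;

public static decimal extractNumberFromString(string s)
{
    if (string.IsNullOrEmpty(s))
    {
        return -1; // callers test for -1 to detect a parse failure
    }
    Match m = Regex.Match(s, @"\d[\d,]*(\.\d+)?"); // first number, allowing commas and a decimal point
    if (!m.Success)
    {
        return -1;
    }
    return decimal.Parse(m.Value,
        NumberStyles.AllowThousands | NumberStyles.AllowDecimalPoint,
        CultureInfo.InvariantCulture);
}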
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; // happens on HTTP errors etc.; we don't bother retrying or anything like that :(
    }

    int maxPage = CrawlUtil.getMaxReviewIFramePageNumber(htmlDoc);

    // If we are at the base (page 1) uri, queue the remaining review pages.
    if (propertyBag.ResponseUri.ToString() == CrawlReviewIFrame.baseUri && maxPage != -1)
    {
        int maxPageToCrawl = maxPage;
        string uri = "";

        //if (maxPage > 10)   // commenting this out means all review pages are crawled
        //{
        //    maxPageToCrawl = 10;
        //}

        for (int i = 2; i <= maxPageToCrawl; i++)
        {
            uri = "http://www.goodreads.com/api/reviews_widget_iframe?did=DEVELOPER_ID&format=html&isbn="
                + CrawlReviewIFrame.currentBook.isbn
                + "&links=660&min_rating=&page=" + i + "&review_back=fff&stars=000&text=000";
            crawler.AddStep(new Uri(uri), 0);
        }

        CrawlReviewIFrame.form.appendLineToLog("Crawling " + maxPageToCrawl + " pages of reviews for "
            + CrawlReviewIFrame.currentBook.getShortTitle());
    }

    // Only process review iframe pages.
    if (!propertyBag.ResponseUri.OriginalString.StartsWith(CrawlReviewIFrame.baseUri.Substring(0, 100)))
    {
        return;
    }

    lock (this)
    {
        string currentPage = "0";
        var currentPageNode = htmlDoc.DocumentNode.SelectSingleNode("//*[@class='current']");
        if (currentPageNode != null)
        {
            currentPage = currentPageNode.InnerText.Trim();
        }

        var reviews = htmlDoc.DocumentNode.SelectNodes("//*[@itemtype='http://schema.org/Review']");
        if (reviews == null || reviews.Count == 0)
        {
            return;
        }

        //** TODO: handle dupes properly. -_-
        // The current method saves each review one by one and ignores any error on save,
        // which also means every review is re-attempted on every run :(
        GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();

        foreach (var r in reviews)
        {
            string reviewUrl;
            int reviewId = -1;
            Match match;
            var reviewLinkNode = r.SelectSingleNode(".//div[@class='gr_review_text']/link[@itemprop='url']");
            DateTime publishDate = DateTime.MinValue;
            short starRating = 0;
            Review toAdd = new Review();

            if (reviewLinkNode != null)
            {
                reviewUrl = reviewLinkNode.GetAttributeValue("href", "null");
                match = regReview.Match(reviewUrl);

                if (Int32.TryParse(match.Groups[1].Value, out reviewId))
                {
                    if (CrawlReviewIFrame.existingReviewIds.Contains(reviewId))
                    {
                        continue; // already in the database
                    }

                    var node = r.SelectSingleNode(".//span[@class='gr_review_date']");
                    if (node != null)
                    {
                        DateTime.TryParse(node.InnerText, out publishDate);
                    }

                    node = r.SelectSingleNode(".//span[@class='gr_rating']");
                    if (node != null)
                    {
                        starRating = CrawlUtil.countStarsFromString(node.InnerText);
                    }

                    toAdd.id = reviewId;
                    toAdd.bookId = CrawlReviewIFrame.currentBook.id;
                    toAdd.publishDate = publishDate;
                    toAdd.starRating = starRating;
                    toAdd.foundOnPage = Int32.Parse(currentPage);
                    toAdd.maxPage = maxPage;

                    context.Reviews.AddObject(toAdd);
                }

                try
                {
                    context.SaveChanges();
                }
                catch (Exception ex)
                {
                    context.Reviews.Detach(toAdd);
                    CrawlReviewIFrame.form.appendLineToLog(ex.Message);
                    if (ex.InnerException != null)
                    {
                        CrawlReviewIFrame.form.appendLineToLog("\t" + ex.InnerException.Message);
                    }
                }
            }
        }

        CrawlReviewIFrame.form.appendLineToLog("Added " + reviews.Count + " on page " + currentPage
            + " of " + maxPage + " for " + CrawlReviewIFrame.currentBook.getShortTitle());
    }
}
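// Neither getMaxReviewIFramePageNumber, countStarsFromString, nor the regReview field
// appears in this listing. Sketches consistent with how they are used above (assumptions,
// not the originals): the iframe paginates with links carrying a page= query parameter,
// the gr_rating span renders star characters, and review urls look like /review/show/123456789.
using System.Text.RegularExpressions;
using HtmlAgilityPack;

static readonly Regex regReview = new Regex(@"/review/show/(\d+)", RegexOptions.Compiled);

public static int getMaxReviewIFramePageNumber(HtmlDocument htmlDoc)
{
    int maxPage = -1; // callers treat -1 as "no pagination found"
    var links = htmlDoc.DocumentNode.SelectNodes("//a[contains(@href, 'page=')]");
    if (links == null)
    {
        return -1;
    }
    foreach (var a in links)
    {
        Match m = Regex.Match(a.GetAttributeValue("href", ""), @"[?&]page=(\d+)");
        int p;
        if (m.Success && int.TryParse(m.Groups[1].Value, out p) && p > maxPage)
        {
            maxPage = p;
        }
    }
    return maxPage;
}

public static short countStarsFromString(string s)
{
    // e.g. "★★★★☆" -> 4; assumes filled stars render as the '★' character
    short count = 0;
    foreach (char ch in s ?? "")
    {
        if (ch == '★')
        {
            count++;
        }
    }
    return count;
}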
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    // If we are at the base uri page, queue the remaining list pages.
    if (propertyBag.ResponseUri.ToString() == "http://www.goodreads.com/list/show/1.Best_Books_Ever?page=" + CrawlList.FromPage)
    {
        string uri = "";
        for (int i = CrawlList.FromPage + 1; i <= CrawlList.ToPage; i++)
        {
            uri = "http://www.goodreads.com/list/show/1.Best_Books_Ever?page=" + i;
            crawler.AddStep(new Uri(uri), 0);
            CrawlList.form.appendLineToLog("also crawling " + uri);
        }
    }

    // Only process list pages.
    if (!propertyBag.ResponseUri.OriginalString.StartsWith("http://www.goodreads.com/list/show/1.Best_Books_Ever"))
    {
        return;
    }

    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc != null)
    {
        lock (this)
        {
            var books = htmlDoc.DocumentNode.SelectNodes("//tr[@itemtype='http://schema.org/Book']");
            if (books == null || books.Count == 0)
            {
                return;
            }

            GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();

            foreach (var b in books)
            {
                string title = "null";
                string authorName = "null";
                var titleURLNode = b.SelectSingleNode(".//*[@class='bookTitle']");
                var authorURLNode = b.SelectSingleNode(".//*[@class='authorName']");
                string titleUrl = "null";
                string authorUrl = "null";
                Match match;
                string bookId = "-1";
                string authorId = "-1";
                Book newBook = null;
                Author author = null;

                if (titleURLNode != null && authorURLNode != null)
                {
                    titleUrl = titleURLNode.GetAttributeValue("href", "null");
                    match = regBook.Match(titleUrl);
                    bookId = match.Groups[1].Value;
                    title = titleURLNode.InnerText.Trim();

                    authorUrl = authorURLNode.GetAttributeValue("href", "null");
                    match = regAuthor.Match(authorUrl);
                    authorId = match.Groups[1].Value;
                    authorName = authorURLNode.InnerText.Trim();

                    author = CrawlUtil.createOrGetAuthor(context, Int32.Parse(authorId), authorName);
                    newBook = CrawlUtil.createOrGetBook(context, Int32.Parse(bookId), title);
                    newBook.Author = author;
                    //author.Book = newBook;
                }

                context.SaveChanges();
            }

            CrawlList.form.appendLineToLog("added/updated " + books.Count + " books and their authors");
        }
    }
}
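// regBook and regAuthor are declared elsewhere in the class, and createOrGetAuthor /
// createOrGetBook live in CrawlUtil. Plausible sketches, assuming Goodreads URLs of the
// form /book/show/2767052-the-hunger-games and /author/show/153394.Suzanne_Collins
// (group 1 capturing the numeric id) and the EF4 ObjectContext seen elsewhere in this code:
using System.Linq;
using System.Text.RegularExpressions;

static readonly Regex regBook = new Regex(@"/book/show/(\d+)", RegexOptions.Compiled);
static readonly Regex regAuthor = new Regex(@"/author/show/(\d+)", RegexOptions.Compiled);

public static Author createOrGetAuthor(GoodReadsCrawlerEntities context, int id, string name)
{
    Author a = context.Authors.SingleOrDefault(x => x.id == id);
    if (a == null)
    {
        a = new Author { id = id };
        context.Authors.AddObject(a); // EF4 ObjectContext API, as used in the steps above
    }
    a.name = name; // refresh the name whether new or existing
    return a;
}
// createOrGetBook, createOrGetUser, and createOrGetChallenge presumably follow the same shape.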
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; // happens on HTTP errors etc.; we don't bother retrying or anything like that :(
    }

    // Only process the review page itself.
    if (propertyBag.ResponseUri.OriginalString != CrawlReviews.baseUri)
    {
        return;
    }

    lock (this)
    {
        GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();
        Review review = CrawlUtil.getReview(context, ReviewId);
        if (review == null)
        {
            return; // this should never happen
        }

        // Columns to fill in:
        //   userId          int           Checked
        //   userIdString    varchar(256)  Checked
        //   reviewContent   varchar(MAX)  Checked
        //   starRating      smallint      Checked
        //   publishDate     date          Checked
        //   recommendedFor  varchar(MAX)  Checked
        //   numComments     int           Checked
        HtmlNode doc = htmlDoc.DocumentNode;
        int userId = -1;
        string userIdString = null;
        string userName = null;
        string reviewContent = null;
        short starRating = -1;
        DateTime publishDate;
        //string recommendedFor = null;
        int numComments = -1;

        // e.g. <a href="/user/show/52663-mer" class="userReview" itemprop="author">Mer</a>
        var reviewAuthorNode = doc.SelectSingleNode("//a[@itemprop='author']");
        if (reviewAuthorNode != null)
        {
            userIdString = reviewAuthorNode.GetAttributeValue("href", null);
            userIdString = CrawlUtil.extractUserIdStringFromUrl(userIdString); // null on error
            userId = CrawlUtil.extractUserIdFromString(userIdString);          // -1 or 0 on error
            userName = reviewAuthorNode.InnerText.Trim();                      // empty on error

            if (userIdString != null && userId > 0 && !String.IsNullOrEmpty(userName))
            {
                var user = CrawlUtil.createOrGetUser(context, userId, userIdString, userName);
                review.User = user;
                review.userIdString = userIdString;
            }
        }

        var reviewContentNode = doc.SelectSingleNode("//div[@itemprop='reviewBody']");
        if (reviewContentNode != null)
        {
            review.reviewContent = reviewContentNode.InnerText.Trim();
        }

        // e.g. <span class="value-title" title="5"></span>
        var starRatingNode = doc.SelectSingleNode("//div[@class='rating']/span[@class='value-title']");
        if (starRatingNode != null)
        {
            short.TryParse(starRatingNode.GetAttributeValue("title", ""), out starRating);
            review.starRating = starRating;
        }

        // e.g. <span class="value-title" title="2007-04-28"></span>
        var publishDateNode = doc.SelectSingleNode("//div[@class='right dtreviewed greyText smallText']/span[@class='value-title']");
        if (publishDateNode != null)
        {
            DateTime.TryParse(publishDateNode.GetAttributeValue("title", ""), out publishDate);
            review.publishDate = publishDate;
        }

        /* <div>
         *   <span class="">Recommended for:</span>
         *   environmentalists, nurturers, parents and children who want to discuss empathy and reciprocity
         * </div> */
        var recommendedForNode = doc.SelectSingleNode("//span[text()='Recommended for:']");
        if (recommendedForNode != null)
        {
            review.recommendedFor = recommendedForNode.ParentNode.LastChild.InnerText.Trim();
        }

        //var numCommentsNode = doc.SelectSingleNode("");
        //if (numCommentsNode != null)
        //{
        //    /* <h2 class="brownBackground">
        //     *   <div class="extraHeader"><a href="#comment_form" rel="nofollow">Post a comment »</a></div>Comments
        //     *   <span class="smallText">(showing 1-5 of 5)</span>
        //     *   ...
        //     * </h2> */
        //}

        context.SaveChanges();
        CrawlReviews.form.appendLineToLog("Added review " + review.id + " by user " + review.User.name);
    }
}
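// extractUserIdStringFromUrl and extractUserIdFromString are CrawlUtil helpers not shown
// here. Sketches matching the documented behaviour ("null if error" and "-1 or 0 if error")
// and hrefs like /user/show/52663-mer, where "52663-mer" is the userIdString and 52663 the
// numeric id (assumptions based on the call sites):
using System.Text.RegularExpressions;

public static string extractUserIdStringFromUrl(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return null;
    }
    Match m = Regex.Match(url, @"/(?:user|author)/show/([^/?#]+)");
    return m.Success ? m.Groups[1].Value : null;
}

public static int extractUserIdFromString(string userIdString)
{
    if (string.IsNullOrEmpty(userIdString))
    {
        return -1;
    }
    Match m = Regex.Match(userIdString, @"^(\d+)"); // leading digits are the numeric id
    int id;
    return (m.Success && int.TryParse(m.Groups[1].Value, out id)) ? id : 0;
}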
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; // happens on HTTP errors etc.; we don't bother retrying or anything like that :(
    }

    HtmlNode doc = htmlDoc.DocumentNode;

    // On page 1, queue the user's other pages. The pagination looks like:
    /* <div>
     *   <span class="previous_page disabled">« previous</span>
     *   <em class="current">1</em>
     *   <a rel="next" href="/list/user_votes/1045275-natasha?page=2">2</a>
     *   <a href="/list/user_votes/1045275-natasha?page=3">3</a>
     *   <a href="/list/user_votes/1045275-natasha?page=4">4</a>
     *   <a href="/list/user_votes/1045275-natasha?page=5">5</a>
     *   <a class="next_page" rel="next" href="/list/user_votes/1045275-natasha?page=2">next »</a>
     * </div> */
    if (propertyBag.ResponseUri.OriginalString == CrawlListAndVotes.baseUri)
    {
        var node = doc.SelectSingleNode(".//a[@class='next_page' and @rel='next']");
        if (node != null)
        {
            try
            {
                // The last numbered link (two siblings back from "next »") is the max page.
                var x = node.PreviousSibling.PreviousSibling;
                int maxPage = Int32.Parse(x.InnerText.Trim());
                string uri;
                for (int i = 2; i <= maxPage; i++)
                {
                    uri = "http://www.goodreads.com/list/user_votes/" + User.userIdString + "?page=" + i;
                    crawler.AddStep(new Uri(uri), 0);
                    CrawlListAndVotes.form.appendLineToLog(uri);
                }
            }
            catch (Exception ex)
            {
                CrawlListAndVotes.form.appendLineToLog(ex.Message);
            }
        }
    }

    lock (this)
    {
        GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();

        foreach (var listNode in doc.SelectNodes(".//div[@class='cell']"))
        {
            List l = null;
            string title = null;

            var titleNode = listNode.SelectSingleNode(".//a[@class='listTitle']");
            if (titleNode != null)
            {
                title = titleNode.InnerText.Trim();
            }

            if (title != null)
            {
                l = CrawlUtil.createOrGetList(context, title);
            }
            else
            {
                continue;
            }

            // e.g. "296 books — 994 voters"
            var statsNode = listNode.SelectSingleNode(".//div[@class='listFullDetails']");
            if (statsNode != null)
            {
                string s = statsNode.InnerText.Replace("\n", "").Trim();
                l.numBooks = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                s = s.Substring(s.IndexOf("books"));
                l.numVoters = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
            }

            User u = CrawlUtil.getUser(context, User.id);
            u.Lists.Add(l);

            try
            {
                context.SaveChanges();
                CrawlListAndVotes.count++;
            }
            catch (Exception ex)
            {
                // The inner exception is usually a duplicate-PK error; undo the association and move on.
                User.Lists.Remove(l);
                //CrawlListAndVotes.form.appendLineToLog(ex.Message);
            }
        }

        CrawlListAndVotes.form.appendLineToLog(User.userIdString + ":: " + CrawlListAndVotes.count + " lists added");
    }
}
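// createOrGetList presumably mirrors the create-or-get pattern sketched earlier, keyed on
// the list title. A sketch under that assumption (the entity set and property names are
// guesses based on the usage above):
using System.Linq;

public static List createOrGetList(GoodReadsCrawlerEntities context, string title)
{
    List l = context.Lists.SingleOrDefault(x => x.title == title);
    if (l == null)
    {
        l = new List { title = title };
        context.Lists.AddObject(l);
    }
    return l;
}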
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; // happens on HTTP errors etc.; we don't bother retrying or anything like that :(
    }

    HtmlNode doc = htmlDoc.DocumentNode;

    lock (this)
    {
        GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();
        HtmlNodeCollection nodes;
        HtmlNode node;

        // Update the user's status changes: private (default 0), author (default 0), or username.
        var url = propertyBag.ResponseUri.AbsoluteUri;
        if (url != CrawlUserProfile.baseUri) // we were redirected: user is an author or has changed useridstring
        {
            if (url == "http://www.goodreads.com/") // the user's profile no longer exists
            {
                CrawlUserProfile.form.appendLineToLog(string.Format("User {0} no longer exists.", User.userIdString));
                User.IsPrivate = true;
            }

            if (url.Contains("/author/"))
            {
                User.IsAuthor = true;
                User.authorUrl = url;
            }

            if (url.Contains("/show/")) // both users and authors can change usernames
            {
                if (User.IsAuthor != true)
                {
                    // When a user changes their username, it can be extracted from the new url.
                    User.userIdString = CrawlUtil.extractUserIdStringFromUrl(url);
                }
                else
                {
                    // An author's username change cannot be detected from the authorUrl we are
                    // redirected to, so search for the new username via a link containing the
                    // user's numeric id, e.g. links to /user/show/37868189.
                    node = doc.SelectSingleNode(".//a[starts-with(@href, '/user/show/" + User.id + "')]");
                    if (node != null)
                    {
                        var IdStringToExtract = node.GetAttributeValue("href", "");
                        User.userIdString = CrawlUtil.extractUserIdStringFromUrl(IdStringToExtract);
                    }
                }
            }

            CrawlUserProfile.form.appendLineToLog(string.Format(
                "User {0} updated with status IsAuthor = {1}| {2}; IsPrivate = {3}",
                User.userIdString, User.IsAuthor, User.authorUrl, User.IsPrivate));
        }

        // Name. Inner text for a user: "******"; for an author: "Derek White".
        node = doc.SelectSingleNode("//h1//text()[normalize-space()]");
        if (node != null)
        {
            User.name = CrawlUtil.extractNameFromString(node.InnerText);
        }

        /* The stats live in markup like:
         * <div class="leftContainer">
         *   <div class="leftAlignedImage" style="overflow: hidden; width: 110px;">
         *     <a href="/photo/user/3255548-ah" rel="nofollow" title="AH">
         *       <img alt="AH" src="http://d.gr-assets.com/users/1275413869p3/3255548.jpg" />
         *     </a>
         *     <div class="smallText">
         *       <a href="/review/list/3255548?sort=rating&view=reviews">874 ratings</a>
         *       <a href="#" onclick="...">(3.82 avg)</a>
         *       <div class="floatingBox" style="display:none; width: 400px;" id="ratingDistribution3255548"></div>
         *       <a href="/review/list/3255548?sort=review&view=reviews">1218 reviews</a>
         *       <a href="/photo/user/3255548-ah" rel="nofollow">more photos (3)</a>
         *       #<a href="/user/best_reviewers?country=CA&duration=a">10 best reviewers</a>
         *       #<a href="/user/top_reviewers?country=CA&duration=a">36 top reviewers</a>
         *     </div>
         *   </div>
         * </div>
         * which renders roughly as:
         *   196 ratings (3.29 avg) 115 reviews #10 best reviewers #36 top reviewers
         */

        // numRatings, numReviews, avgRating, badges.
        // EXCEPT badges, this code works for both user and author pages.
        nodes = doc.SelectNodes(".//div[@class='leftContainer']//div[@class='smallText']//text()[normalize-space()]");
        if (nodes == null)
        {
            // Private profiles still carry this data but in a different HTML layout;
            // if that different layout is found, we deem the profile private.
            User.IsPrivate = true;
            CrawlUserProfile.form.appendLineToLog(string.Format("User {0} profile is deemed private.", User.userIdString));
            return;
        }

        int badgeNum = 1;
        foreach (var n in nodes) // numerical values and badges
        {
            string inner = n.InnerText.Trim();

            // Text beginning with "//" is script text: skip it.
            if (inner.StartsWith(@"//"))
            {
                continue;
            }

            if (inner.StartsWith("#")) // text beginning with "#" is a badge
            {
                switch (badgeNum)
                {
                    case 1: User.badge1 = inner; break;
                    case 2: User.badge2 = inner; break;
                    case 3: User.badge3 = inner; break;
                    case 4: User.badge4 = inner; break;
                    case 5: User.badge5 = inner; break;
                }
                badgeNum++;
            }
            else
            {
                decimal d = CrawlUtil.extractNumberFromString(inner);
                if (d != -1)
                {
                    // Author pages use "avg rating", which matches both "rating" and "avg",
                    // so match "avg" first.
                    if (inner.Contains("avg"))
                    {
                        User.avgRating = d;
                    }
                    else if (inner.Contains("rating"))
                    {
                        User.numRatings = Convert.ToInt32(d);
                    }
                    else if (inner.Contains("review"))
                    {
                        User.numReviews = Convert.ToInt32(d);
                    }
                }
            }
        }

        // Badges for authors live in the topListSection instead.
        if (User.IsAuthor == true)
        {
            nodes = doc.SelectNodes(".//div[@class='leftContainer']//div[@id='topListSection']//text()[normalize-space()]");
            if (nodes != null)
            {
                badgeNum = 1;
                foreach (var n in nodes)
                {
                    string inner = n.InnerText.Trim();
                    if (inner.StartsWith("#"))
                    {
                        switch (badgeNum)
                        {
                            case 1: User.badge1 = inner; break;
                            case 2: User.badge2 = inner; break;
                            case 3: User.badge3 = inner; break;
                            case 4: User.badge4 = inner; break;
                            case 5: User.badge5 = inner; break;
                        }
                        badgeNum++;
                    }
                }
            }
        }

        // numFriends. Works for both users and authors, e.g.:
        //   <a href="/friend/user/104320-erin-beck" rel="nofollow">Erin’s Friends (4)</a>
        //   <a rel="nofollow" href="/friend/user/3094317-tori-smexybooks-smexys-sidekick">(Tori-Smexybooks)’s Friends (1,505)</a>
        node = doc.SelectSingleNode(".//a[@href='/friend/user/" + User.userIdString + "']/text()");
        if (node != null)
        {
            decimal d = CrawlUtil.extractNumberFromString(node.InnerText);
            if (d != -1)
            {
                User.numFriends = Convert.ToInt32(d);
            }
        }

        /* Each friend's html:
         * <div class="left">
         *   <div class="friendName">
         *     <a href="/user/show/355607-t-k-kenyon" rel="acquaintance">T.K. Kenyon</a>
         *   </div>
         *   819 books
         *   <span class="greyText">|</span>
         *   2,392 friends
         * </div> */
        // Summary details of the 8 friends shown on the profile.
        if (CrawlUserProfile.addFriends)
        {
            nodes = doc.SelectNodes(".//div[@class='left']");
            if (nodes != null)
            {
                foreach (var n in nodes) // each friend
                {
                    Friendship f = new Friendship();
                    f.userId = this.User.id;

                    node = n.SelectSingleNode(".//div[@class='friendName']/a");
                    if (node != null)
                    {
                        string attr = node.GetAttributeValue("href", "");
                        f.friendIdString = CrawlUtil.extractUserIdStringFromUrl(attr);
                        f.rel = node.GetAttributeValue("rel", "");
                    }
                    else
                    {
                        continue;
                    }

                    node = n.SelectSingleNode("./text()[normalize-space()]"); // number of books
                    if (node != null)
                    {
                        int numBooks = Convert.ToInt32(CrawlUtil.extractNumberFromString(node.InnerText));
                        if (numBooks != -1)
                        {
                            f.friendNumBooks = numBooks;
                        }

                        node = node.SelectSingleNode("following-sibling::text()"); // number of friends
                        if (node != null)
                        {
                            int numFriends = Convert.ToInt32(CrawlUtil.extractNumberFromString(node.InnerText));
                            if (numFriends != -1)
                            {
                                f.friendNumFriends = numFriends;
                            }
                        }
                    }

                    try
                    {
                        context.Friendships.AddObject(f);
                        context.SaveChanges();
                    }
                    catch (Exception)
                    {
                        context.Friendships.Detach(f); // most likely a duplicate
                    }
                }
            }
        }

        // User.numFollowers, e.g.:
        //   users:   <a class="actionLink" rel="nofollow" href="/user/3094317-tori-smexybooks-smexys-sidekick/followers">319 people are following (Tori-Smexybooks)</a>
        //   authors: <a href="/author_followings?id=6458332&method=get">Lucas Lyndes’s Followers (8)</a>
        if (User.IsAuthor == true)
        {
            node = doc.SelectSingleNode(".//a[starts-with(@href, '/author_followings?')]/text()");
        }
        else
        {
            node = doc.SelectSingleNode(".//a[@href='/user/" + User.userIdString + "/followers']/text()");
        }
        if (node != null)
        {
            decimal d = CrawlUtil.extractNumberFromString(node.InnerText);
            if (d != -1)
            {
                User.numFollowers = Convert.ToInt32(d);
            }
        }

        // User.numUserIsFollowing - see [done]goodreads-numUserIsFollowing logic. N/A for authors.
        if (User.IsAuthor == true)
        {
            User.numUserIsFollowing = null;
        }
        else
        {
            nodes = doc.SelectNodes(".//a[contains(text(),'is Following')]/../../../div[@class='bigBoxBody']//div/a");
            if (nodes != null)
            {
                User.numUserIsFollowing = nodes.Count;
            }
        }

        // Quiz stats - see [done]goodreads-quiz logic.html. N/A for authors.
        // User.quizNumCorrect, User.quizNumQuestions, User.quizRank, User.quizRankOutOf
        if (User.IsAuthor == true)
        {
            User.quizNumCorrect = null;
            User.quizNumQuestions = null;
            User.quizRank = null;
            User.quizRankOutOf = null;
        }
        else
        {
            nodes = doc.SelectNodes(".//div[@id='myQuizStats']//div[@class='infoBoxRowTitle' or @class='infoBoxRowItem']");
            if (nodes != null)
            {
                string s = "";
                foreach (var n in nodes)
                {
                    if (n.InnerText.Contains("questions answered"))
                    {
                        s = n.NextSibling.NextSibling.InnerText;
                        User.quizNumQuestions = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                    }
                    else if (n.InnerText.Contains("correct"))
                    {
                        s = n.NextSibling.NextSibling.InnerText;
                        User.quizNumCorrect = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                    }
                    else if (n.InnerText.Contains("ranking"))
                    {
                        s = n.NextSibling.NextSibling.InnerText;
                        User.quizRank = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                        s = s.Substring(s.IndexOf("out of"));
                        User.quizRankOutOf = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                    }
                }
            }
        }

        // User.ReadingChallenges - see [done]goodreads-challenge logic.html.
        // CHECK if working for authors.
        if (CrawlUserProfile.addChallenges)
        {
            nodes = doc.SelectNodes(".//div[@class='challengePic']");
            if (nodes != null)
            {
                foreach (var n in nodes)
                {
                    var challengePic = n.SelectSingleNode(".//img");
                    if (challengePic != null)
                    {
                        string challenge = challengePic.GetAttributeValue("alt", "unknown");
                        if (challenge != "unknown")
                        {
                            ReadingChallenge rc = CrawlUtil.createOrGetChallenge(context, User.id, challenge);
                            try
                            {
                                var stats = n.NextSibling.NextSibling.SelectSingleNode(".//div[@class='bookMeta progressStats']");
                                if (stats != null)
                                {
                                    string s = stats.InnerText;
                                    rc.numBooksRead = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                                    s = s.Substring(s.IndexOf(" of "));
                                    rc.numBooksTarget = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                                }
                                else
                                {
                                    stats = n.NextSibling.NextSibling.SelectSingleNode(".//a[@class='challengeBooksRead']");
                                    if (stats != null)
                                    {
                                        string s = stats.InnerText;
                                        rc.numBooksRead = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                                        s = s.Substring(s.IndexOf(" of "));
                                        rc.numBooksTarget = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                                    }
                                }
                                rc.lastUpdated = DateTime.Now;
                                context.SaveChanges();
                            }
                            catch (Exception ex)
                            {
                                context.ReadingChallenges.Detach(rc);
                            }
                        }
                    }
                }
            }
        }

        // User.IsLibrarian, e.g.:
        //   <a href="/librarian/user/255233-sonic" rel="nofollow">Goodreads librarian</a>
        node = doc.SelectSingleNode(".//a[@href='/librarian/user/" + User.userIdString + "']");
        User.IsLibrarian = (node != null && node.InnerText == "Goodreads librarian");

        // User.Genres - see [done]goodreads-genres (fav) logic.html.
        // UNCHECKED for round 5 update.
        if (CrawlUserProfile.addGenres)
        {
            nodes = doc.SelectNodes(".//h2[contains(text(),'Favorite Genres')]/../../div[@class='bigBoxBody']//a");
            if (nodes != null)
            {
                foreach (var n in nodes)
                {
                    Genre g = new Genre();
                    g.name = n.InnerText.Trim();
                    g.userId = User.id;
                    try
                    {
                        context.AddToGenres(g);
                        context.SaveChanges();
                    }
                    catch (Exception)
                    {
                        context.Genres.Detach(g);
                    }
                }
            }
        }

        // User.Activities - see [done]goodreads-updates logic.html.
        // UNCHECKED for round 5 update.
        if (CrawlUserProfile.addActivities)
        {
            nodes = doc.SelectNodes(".//table[@class='tableListReverse friendUpdates']/tr[@class='update' or @class='no_border']");
            if (nodes != null)
            {
                foreach (var n in nodes)
                {
                    if ("update" == n.GetAttributeValue("class", "unknown"))
                    {
                        Activity a = new Activity();
                        try
                        {
                            a.userId = User.id;
                            a.activityHTML = n.InnerHtml.Length > 8000 ? n.InnerHtml.Substring(0, 7999) : n.InnerHtml;
                            var ts = n.SelectSingleNode("following-sibling::tr//a[@class='updatedTimestamp']");
                            a.activityTimestampString = ts.InnerText;
                            a.retrievedAt = DateTime.Now;
                            context.AddToActivities(a);
                            context.SaveChanges();
                        }
                        catch (Exception ex)
                        {
                            context.Activities.Detach(a);
                        }
                    }
                }
            }
        }

        // Handled by separate functions/classes:
        //   User.numFavouriteAuthors, User.Groups, User.Lists, User.numShelves
    }

    CrawlUserProfile.form.appendLineToLog(User.userIdString + ":: details and ticked updated.");
}
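// The review-list steps below look entities up by primary key via CrawlUtil.getReview,
// getBook, getAuthor, and getUser. These are presumably thin lookups; a sketch of one,
// assuming LINQ to Entities over the same ObjectContext (the others would be analogous):
using System.Linq;

public static Review getReview(GoodReadsCrawlerEntities context, int id)
{
    return context.Reviews.SingleOrDefault(r => r.id == id); // null when not yet crawled
}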
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; // happens on HTTP errors etc.; we don't bother retrying or anything like that :(
    }

    HtmlNode doc = htmlDoc.DocumentNode;

    // On page 1, queue the user's other review pages.
    if (propertyBag.ResponseUri.OriginalString == CrawlReviewsOnUserProfile.baseUri)
    {
        var node = doc.SelectSingleNode(".//div[@id='reviewPagination']");
        if (node != null)
        {
            try
            {
                // The last numbered link (two siblings back from "next »") is the max page.
                var x = node.LastChild.PreviousSibling.PreviousSibling;
                int maxPage = Int32.Parse(x.InnerText.Trim());
                string uri;
                for (int i = 2; i <= maxPage; i++)
                {
                    uri = "http://www.goodreads.com/review/list/" + User.userIdString
                        + "?format=html&page=" + i + "&per_page=100&shelf=read";
                    crawler.AddStep(new Uri(uri), 0);
                    CrawlReviewsOnUserProfile.form.appendLineToLog(uri);
                }
            }
            catch (Exception) { }
        }
    }

    lock (this)
    {
        GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();

        foreach (var reviewNode in doc.SelectNodes(".//tr[@class='bookalike review']"))
        {
            CrawlReviewsOnUserProfile.count++;

            string reviewIdString = reviewNode.GetAttributeValue("id", "");
            if (reviewIdString == "")
            {
                return;
            }
            int reviewId = Int32.Parse(reviewIdString.Split('_').GetValue(1).ToString());

            Review review = CrawlUtil.getReview(context, reviewId);

            // Create and process the REVIEW only if it doesn't already exist.
            if (review == null)
            {
                HtmlNode node;
                review = new Review();
                review.id = reviewId;

                // REVIEW.starRating
                node = reviewNode.SelectSingleNode(".//td[@class='field rating']//img");
                if (node != null)
                {
                    string ratingString = node.GetAttributeValue("alt", "0");
                    short rating = short.Parse(ratingString.Substring(0, 1));
                    review.starRating = rating;
                }

                // REVIEW.publishDate
                node = reviewNode.SelectSingleNode(".//td[@class='field date_added']//span");
                if (node != null)
                {
                    DateTime date;
                    DateTime.TryParse(node.InnerText, out date);
                    review.publishDate = date;
                }

                // USER
                review.userId = User.id;
                review.userIdString = User.userIdString;

                // BOOK
                node = reviewNode.SelectSingleNode(".//td[@class='field title']//a");
                string bookUrl = node.GetAttributeValue("href", "");
                int bookId = CrawlUtil.extractIdNumberFromUrl(bookUrl); // 0 if bookUrl is null
                Book book = CrawlUtil.getBook(context, bookId);
                if (book == null)
                {
                    book = new Book();
                    book.id = bookId;
                    book.title = node.GetAttributeValue("title", "");

                    node = reviewNode.SelectSingleNode(".//td[@class='field isbn']//div");
                    if (node != null)
                    {
                        book.isbn = node.InnerText.Trim();
                    }

                    // AUTHOR
                    node = reviewNode.SelectSingleNode(".//td[@class='field author']//a");
                    if (node != null)
                    {
                        string authorUrl = node.GetAttributeValue("href", "");
                        int authorId = CrawlUtil.extractIdNumberFromUrl(authorUrl); // 0 if authorUrl is null
                        Author author = CrawlUtil.getAuthor(context, authorId);
                        if (author == null)
                        {
                            author = new Author();
                            author.id = authorId;
                            author.name = CrawlUtil.formatAuthorName(node.InnerText.Trim());
                            book.Author = author;
                        }
                    }
                }

                review.Book = book;
                context.SaveChanges();
            }
        }

        CrawlReviewsOnUserProfile.form.appendLineToLog(User.userIdString + ":: " + CrawlReviewsOnUserProfile.count + " reviews crawled");
    }
}
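// extractIdNumberFromUrl pulls the numeric id out of /book/show/... and /author/show/...
// hrefs; per the comments above, it yields 0 when the url is null or unparsable. A sketch
// (an assumption; the real helper lives in CrawlUtil):
using System.Text.RegularExpressions;

public static int extractIdNumberFromUrl(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return 0;
    }
    Match m = Regex.Match(url, @"/show/(\d+)");
    int id;
    return (m.Success && int.TryParse(m.Groups[1].Value, out id)) ? id : 0;
}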
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; // happens on HTTP errors etc.; we don't bother retrying or anything like that :(
    }

    HtmlNode doc = htmlDoc.DocumentNode;

    // On page 1, queue the user's other review pages.
    if (propertyBag.ResponseUri.OriginalString == CrawlReviewsOnUserProfile_Updated.baseUri)
    {
        var node = doc.SelectSingleNode(".//div[@id='reviewPagination']");
        if (node != null)
        {
            HtmlNode maxPageNode;
            int maxPage = 0;
            try
            {
                maxPageNode = node.LastChild.PreviousSibling.PreviousSibling;
                maxPage = Int32.Parse(maxPageNode.InnerText.Trim());
            }
            catch (Exception)
            {
                CrawlReviewsOnUserProfile_Updated.form.appendLineToLog("Error getting maxPage on " + propertyBag.ResponseUri.OriginalString);
                return;
            }

            // Get only the new reviews since the last crawl?
            int pagesToCrawl = 0;
            if (CrawlReviewsOnUserProfile_Updated.getReviewsSinceLastCrawl)
            {
                pagesToCrawl = maxPage - (User.Reviews.Count / 20); // int division truncates
                if (pagesToCrawl < 1)
                {
                    return;
                }

                /**** TEMP: to get pages 30 and above (for users with more than 600 reviews,
                 * after a previous run that stopped at 600) ****/
                //for (int i = 30; i <= maxPage; i++)
                //{
                //    String s = "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + i + "&print=true&shelf=read&sort=date_added&view=reviews";
                //    crawler.AddStep(new Uri(s), 0);
                //    CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(s);
                //}
                //return;

                /*** Old logic, pre 2015-11-30 ***
                 * int startPage = (User.Reviews.Count / 20) + 1;
                 * string uri;
                 * for (int i = startPage; i <= maxPage; i++)
                 * {
                 *     uri = "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + i + "&print=true&shelf=read&sort=date_added&view=reviews";
                 *     crawler.AddStep(new Uri(uri), 0);
                 *     CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(uri);
                 * }
                 * return;
                 *************/
            }
            else // crawl every page
            {
                pagesToCrawl = maxPage;
            }

            string uri;
            for (int i = 2; i <= pagesToCrawl; i++)
            {
                // e.g. http://www.goodreads.com/review/list/1-otis-chandler?page=3&print=true&shelf=read
                uri = "http://www.goodreads.com/review/list/" + User.userIdString + "?page=" + i + "&print=true&shelf=read";
                crawler.AddStep(new Uri(uri), 0);
                CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(uri);
            }

            // Continue with the crawl on page 1 unless endPage is 0 (i.e. no pages need to be crawled).
            // 2015-11-30: getting the latest X pages is now redundant since reviews are always
            // sorted by date added, ascending. Feature removed for the 2015 update-5 crawl:
            //get reviews from specified pages, or the latest X pages;
            //get the user's latest X pages of reviews if the user has more than (maxPage * 20) reviews; X determined by getHowManyLatestPages
            //if (maxPage > CrawlReviewsOnUserProfile_Updated.maxPage)
            //{
            //    if (CrawlReviewsOnUserProfile_Updated.getHowManyLatestPages > 0)
            //    {
            //        int numLatestPages = Math.Min(CrawlReviewsOnUserProfile_Updated.getHowManyLatestPages, maxPage - CrawlReviewsOnUserProfile_Updated.maxPage);
            //        for (int i = 0; i < numLatestPages; i++)
            //        {
            //            string uriLatest = "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + (maxPage - i) + "&print=true&shelf=read&sort=date_added&view=reviews";
            //            crawler.AddStep(new Uri(uriLatest), 0);
            //            CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(uriLatest);
            //        }
            //    }
            //    maxPage = CrawlReviewsOnUserProfile_Updated.maxPage;
            //}
            //string u;
            //for (int i = CrawlReviewsOnUserProfile_Updated.minPage; i <= maxPage; i++)
            //{
            //    u = "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + i + "&print=true&shelf=read&sort=date_added&view=reviews";
            //    crawler.AddStep(new Uri(u), 0);
            //    CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(u);
            //}
            ////if page 1 shouldn't be included, stop here after adding the other pages to crawl
            //if (CrawlReviewsOnUserProfile_Updated.minPage > 1)
            //{
            //    return;
            //}
        }
    }

    lock (this)
    {
        GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();

        foreach (var reviewNode in doc.SelectNodes(".//tr[@class='bookalike review']"))
        {
            string reviewIdString = reviewNode.GetAttributeValue("id", "");
            if (reviewIdString == "")
            {
                return;
            }
            int reviewId = Int32.Parse(reviewIdString.Split('_').GetValue(1).ToString());

            //Review review = CrawlUtil.createOrGetReview(context, reviewId);
            Review review = CrawlUtil.getReview(context, reviewId);
            if (review == null) // review is new
            {
                review = new Review();
                review.id = reviewId;
                context.Reviews.AddObject(review);
            }
            else // review already exists
            {
                continue;
            }

            HtmlNode node;

            // REVIEW.starRating
            /* <td class="field rating">
             *   <label>Reb's rating</label>
             *   <div class="value">
             *     <a class=" staticStars stars_4" title="really liked it">4 of 5 stars</a>
             *   </div>
             * </td> */
            node = reviewNode.SelectSingleNode(".//td[@class='field rating']//a");
            if (node != null)
            {
                string ratingClassString = node.GetAttributeValue("class", "0");
                short rating = CrawlUtil.getRatingFromClassString(ratingClassString);
                review.starRating = rating;
            }

            // REVIEW.publishDate
            node = reviewNode.SelectSingleNode(".//td[@class='field date_added']//span");
            if (node != null)
            {
                DateTime date;
                DateTime.TryParse(node.InnerText, out date);
                review.publishDate = date;
            }

            // USER
            review.userId = User.id;
            review.userIdString = User.userIdString;

            // BOOK
            node = reviewNode.SelectSingleNode(".//td[@class='field title']//a");
            string bookUrl = node.GetAttributeValue("href", "");
            int bookId = CrawlUtil.extractIdNumberFromUrl(bookUrl); // 0 if bookUrl is null
            Book book = CrawlUtil.getBook(context, bookId);
            if (book == null)
            {
                book = new Book();
                book.id = bookId;
                book.title = node.GetAttributeValue("title", "");

                node = reviewNode.SelectSingleNode(".//td[@class='field isbn']//div");
                if (node != null)
                {
                    book.isbn = node.InnerText.Trim();
                }

                // AUTHOR
                node = reviewNode.SelectSingleNode(".//td[@class='field author']//a");
                if (node != null)
                {
                    string authorUrl = node.GetAttributeValue("href", "");
                    int authorId = CrawlUtil.extractIdNumberFromUrl(authorUrl); // 0 if authorUrl is null
                    Author author = CrawlUtil.getAuthor(context, authorId);
                    if (author == null)
                    {
                        author = new Author();
                        author.id = authorId;
                        author.name = CrawlUtil.formatAuthorName(node.InnerText.Trim());
                        book.Author = author;
                    }
                }
            }

            review.Book = book;
            context.SaveChanges();
            CrawlReviewsOnUserProfile_Updated.count++;
        }

        CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(User.userIdString + ":: " + CrawlReviewsOnUserProfile_Updated.count + " reviews crawled");
    }
}
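// getRatingFromClassString maps the rating anchor's class attribute (e.g. " staticStars
// stars_4", title "4 of 5 stars") to a star count. A sketch consistent with that markup
// (an assumption; the real helper is in CrawlUtil):
using System.Text.RegularExpressions;

public static short getRatingFromClassString(string classString)
{
    Match m = Regex.Match(classString ?? "", @"stars_(\d)");
    short rating;
    return (m.Success && short.TryParse(m.Groups[1].Value, out rating)) ? rating : (short)0;
}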