/// <summary>
/// Handles one crawled "user votes" page: on the first page it queues the
/// remaining pages for the same user, then it stores every list found on the
/// page and links it to the user.
/// </summary>
/// <param name="crawler">Crawler that receives the additional page URIs.</param>
/// <param name="propertyBag">
/// Crawl result; the parsed document is read from its "HtmlDoc" property and
/// the page address from <c>ResponseUri</c>.
/// </param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        // This happens with HTTP errors etc. No retry is attempted.
        return;
    }

    HtmlNode doc = htmlDoc.DocumentNode;

    // Only page 1 (the base URI) fans out to the other pages, so each page
    // is queued once.
    if (propertyBag.ResponseUri.OriginalString == CrawlListAndVotes.baseUri)
    {
        QueueRemainingPages(crawler, doc);
    }

    // NOTE(review): lock (this) is kept so that any other code synchronizing
    // on this instance keeps working; consider a private gate object if
    // nothing else relies on it.
    lock (this)
    {
        SaveListsOnPage(doc);
    }
}

/// <summary>
/// Reads the highest page number from the pager and queues pages 2..max.
/// Any failure while reading the pager is logged and otherwise ignored.
/// </summary>
private void QueueRemainingPages(Crawler crawler, HtmlNode doc)
{
    /*
     * Pager markup this code expects:
     *
     * <div>
     *   <span class="previous_page disabled">« previous</span>
     *   <em class="current">1</em>
     *   <a rel="next" href="/list/user_votes/1045275-natasha?page=2">2</a>
     *   <a href="/list/user_votes/1045275-natasha?page=3">3</a>
     *   <a href="/list/user_votes/1045275-natasha?page=4">4</a>
     *   <a href="/list/user_votes/1045275-natasha?page=5">5</a>
     *   <a class="next_page" rel="next" href="/list/user_votes/1045275-natasha?page=2">next »</a>
     * </div>
     */
    var nextLink = doc.SelectSingleNode(".//a[@class='next_page' and @rel='next']");
    if (nextLink == null)
    {
        // No "next" link: nothing further is queued.
        return;
    }

    try
    {
        // Two siblings back from "next »": the whitespace text node is
        // skipped and the link carrying the last page number is reached.
        var lastPageNode = nextLink.PreviousSibling.PreviousSibling;
        int maxPage = Int32.Parse(lastPageNode.InnerText.Trim());

        for (int page = 2; page <= maxPage; page++)
        {
            string uri = "http://www.goodreads.com/list/user_votes/" + User.userIdString + "?page=" + page;
            crawler.AddStep(new Uri(uri), 0);
            CrawlListAndVotes.form.appendLineToLog(uri);
        }
    }
    catch (Exception ex)
    {
        // Best effort: unexpected pager markup must not abort the page.
        CrawlListAndVotes.form.appendLineToLog(ex.Message);
    }
}

/// <summary>
/// Stores every list found on the page and associates it with the current
/// user, then logs the running total. Must be called while holding the lock
/// taken in <see cref="Process"/>.
/// </summary>
private void SaveListsOnPage(HtmlNode doc)
{
    // Assumes the entity context implements IDisposable (Entity Framework
    // contexts do) - TODO confirm. Disposing releases its connection.
    using (GoodReadsCrawlerEntities context = CrawlUtil.getNewContext())
    {
        // SelectNodes yields null rather than an empty collection when
        // nothing matches, so a page without lists must be guarded.
        var listNodes = doc.SelectNodes(".//div[@class='cell']");
        if (listNodes != null)
        {
            foreach (var listNode in listNodes)
            {
                var titleNode = listNode.SelectSingleNode(".//a[@class='listTitle']");
                if (titleNode == null)
                {
                    // A cell without a title link is skipped.
                    continue;
                }

                List l = CrawlUtil.createOrGetList(context, titleNode.InnerText.Trim());
                ReadListStats(l, listNode);

                User u = CrawlUtil.getUser(context, User.id);
                u.Lists.Add(l);

                try
                {
                    context.SaveChanges();
                    CrawlListAndVotes.count++;
                }
                catch (Exception)
                {
                    // Per the original note, the usual cause is a duplicate
                    // primary key. Undo the pending link on the same
                    // collection it was added to, so the failed insert is
                    // not retried with every following SaveChanges.
                    u.Lists.Remove(l);
                }
            }
        }
    }

    CrawlListAndVotes.form.appendLineToLog(User.userIdString + ":: " + CrawlListAndVotes.count + " lists added");
}

/// <summary>
/// Fills <c>numBooks</c> and <c>numVoters</c> from the cell's details text,
/// e.g. "296 books — 994 voters". Leaves the list untouched when the
/// details node is missing.
/// </summary>
private static void ReadListStats(List l, HtmlNode listNode)
{
    var statsNode = listNode.SelectSingleNode(".//div[@class='listFullDetails']");
    if (statsNode == null)
    {
        return;
    }

    string stats = statsNode.InnerText.Replace("\n", "").Trim();

    // Presumably extractNumberFromString returns the first number in the
    // text - verify against CrawlUtil.
    l.numBooks = Convert.ToInt32(CrawlUtil.extractNumberFromString(stats));

    // Match "book" so that the singular "1 book" is handled too; a missing
    // marker would otherwise make Substring throw on index -1.
    int bookIndex = stats.IndexOf("book", StringComparison.Ordinal);
    if (bookIndex >= 0)
    {
        string afterBooks = stats.Substring(bookIndex);
        l.numVoters = Convert.ToInt32(CrawlUtil.extractNumberFromString(afterBooks));
    }
}