public void Process(Crawler crawler, PropertyBag propertyBag) { //add pages only if at base uri page if (propertyBag.ResponseUri.ToString() == "http://www.goodreads.com/list/show/1.Best_Books_Ever?page=" + CrawlList.FromPage) { string uri = ""; for (int i = CrawlList.FromPage + 1; i <= CrawlList.ToPage; i++) { uri = "http://www.goodreads.com/list/show/1.Best_Books_Ever?page=" + i; crawler.AddStep(new Uri(uri), 0); CrawlList.form.appendLineToLog("also crawling " + uri); } } //only process list pages if (!propertyBag.ResponseUri.OriginalString.StartsWith("http://www.goodreads.com/list/show/1.Best_Books_Ever")) { return; } var s = propertyBag["HtmlDoc"].Value; HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument; if (htmlDoc != null) { lock (this) { var books = htmlDoc.DocumentNode.SelectNodes("//tr[@itemtype='http://schema.org/Book\']"); if (books == null || books.Count == 0) { return; } GoodReadsCrawlerEntities context = CrawlUtil.getNewContext(); foreach (var b in books) { string title = "null"; string authorName = "null"; var titleURLNode = b.SelectSingleNode(".//*[@class='bookTitle']"); var authorURLNode = b.SelectSingleNode(".//*[@class='authorName']"); string titleUrl = "null"; string authorUrl = "null"; Match match; string bookId = "-1"; string authorId = "-1"; Book newBook = null; Author author = null; if (titleURLNode != null && authorURLNode != null) { titleUrl = titleURLNode.GetAttributeValue("href", "null"); match = regBook.Match(titleUrl); bookId = match.Groups[1].Value; title = titleURLNode.InnerText.Trim(); authorUrl = authorURLNode.GetAttributeValue("href", "null"); match = regAuthor.Match(authorUrl); authorId = match.Groups[1].Value; authorName = authorURLNode.InnerText.Trim(); author = CrawlUtil.createOrGetAuthor(context, Int32.Parse(authorId), authorName); newBook = CrawlUtil.createOrGetBook(context, Int32.Parse(bookId), title); newBook.Author = author; //author.Book = newBook; } context.SaveChanges(); } CrawlList.form.appendLineToLog("added/updated " + books.Count + " books and their authors"); } } }