/// <summary>
/// Pipeline step for a book's "details page". Reads the parsed HTML document from the property bag and,
/// when <c>BookHtmlCrawler.GetDetailsAndAuthor</c> is set, stores the average customer rating and the
/// review page URL on the book that is currently being crawled.
/// </summary>
/// <param name="crawler">Crawler that fetched the page; its user agent is flagged as blocked when a robot check page is served.</param>
/// <param name="propertyBag">Supplies the "HtmlDoc" entry, the page title and text, and the response URI.</param>
/// <remarks>
/// Using c.MaximumCrawlDepth = 1 means only the details page will be processed
/// (no other links are crawled as they're depth 2).
/// </remarks>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
    }

    BookHtmlCrawler.form.appendLineToLog("Crawling 'details page' for " + BookHtmlCrawler.currentBook.DisplayString);
    BookHtmlCrawler.form.appendLineToLog(propertyBag.ResponseUri.OriginalString);

    lock (this)
    {
        //check if the correct page is loaded (sometimes Amazon redirects to a robot check)
        //ordinal, case-insensitive comparison: no culture dependence and no failure on a missing title
        if (string.Equals(propertyBag.Title, "robot check", StringComparison.OrdinalIgnoreCase))
        {
            //log at most the first 100 characters; the page text may be shorter than that
            string pageText = propertyBag.Text ?? string.Empty;
            string excerpt = pageText.Substring(0, Math.Min(100, pageText.Length));
            BookHtmlCrawler.form.appendLineToLog(propertyBag.Title + Environment.NewLine + excerpt);

            //block the user agent that's currently in use
            BookHtmlCrawler.UserAgentTracker[crawler.UserAgent] = true;

            //sleep between 30 secs and 3 minutes
            TimeSpan duration = new TimeSpan(0, 0, RandomNumber.Next(30, 3 * 60));
            BookHtmlCrawler.form.appendLineToLog(string.Format("Sleeping for {0}:{1} mins and secs before trying next book.", duration.Minutes, duration.Seconds));
            System.Threading.Thread.Sleep(duration);
            return;
        }

        string updated = "";
        using (AmazonCrawlerEntities context = DbUtil.getNewContext())
        {
            Book b = DbUtil.getBook(context, BookHtmlCrawler.currentBook.ISBN);
            if (b == null) //this should never happen
            {
                BookHtmlCrawler.form.appendLineToLog("[error] ISBN " + BookHtmlCrawler.currentBook.ISBN + " not found in database");
                return;
            }

            if (BookHtmlCrawler.GetDetailsAndAuthor)
            {
                //author, language, reading level, format, sales ranks not stored.

                #region average rating
                var averageRatingNode = htmlDoc.DocumentNode.SelectSingleNode(@"//div[@id='avgRating']/span/a");
                if (averageRatingNode != null)
                {
                    string ratingText = averageRatingNode.InnerText;
                    decimal rating = CrawlUtil.ExtractRatingFromSummaryString(CrawlUtil.TidyHtmlText(ratingText));
                    b.avgCustomerReview = rating;
                    BookHtmlCrawler.form.appendLineToLog("\tavg: " + rating);
                }
                #endregion

                #region review page URL
                //the review URL is the link's href with everything from "/ref=" onwards cut off
                string url = null;
                var reviewUrlNode = htmlDoc.DocumentNode.SelectSingleNode(@"//a[@class='a-link-emphasis a-nowrap']/@href");
                if (reviewUrlNode != null)
                {
                    var working_url = reviewUrlNode.GetAttributeValue("href", null);
                    if (!string.IsNullOrEmpty(working_url))
                    {
                        int refIndex = working_url.IndexOf("/ref=", StringComparison.Ordinal);
                        if (refIndex > 0)
                        {
                            url = working_url.Substring(0, refIndex);
                        }
                    }
                }

                try //save for rating and reviewURL
                {
                    BookHtmlCrawler.form.appendLineToLog("\treview URL added: " + url);
                    b.reviewPageURL = url ?? "-"; //"-" is stored when no review URL could be extracted
                    context.SaveChanges();
                }
                catch (Exception ex)
                {
                    //best effort: the failure is logged and processing carries on
                    BookHtmlCrawler.form.appendLineToLog(ex.Message);
                }
                #endregion

                //2015-12-01: the rating stats feature (star counts via CrawlUtil.getStarRating, TEMP since 2014-06-12)
                //is disabled. It looks like the ratings are now in %s rather than numbers.

                updated += "DETAILS (avg rating, reviewURL)";
            }
        }

        BookHtmlCrawler.form.appendLineToLog("Updated " + updated + " for " + BookHtmlCrawler.currentBook.DisplayString);
    }
}
/// <summary>
/// Loads a saved review page from disk, works out the book's ISBN from the page's canonical link,
/// and saves every review found on the page (together with its reviewer and any reviewer badges).
/// </summary>
/// <param name="filePath">Path of the HTML file to load.</param>
private void ProcessPage(string filePath)
{
    DateTime currentDateTime = DateTime.Today;

    HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
    html.Load(filePath);
    HtmlNode doc = html.DocumentNode;

    appendLineToLog("Crawling a 'review page' for " + filePath);

    //<link rel="canonical" href="http://www.amazon.com/Breaking-Dawn-Twilight-Saga-Book/product-reviews/031606792X?pageNumber=113">
    HtmlNode isbnNode = doc.SelectSingleNode(".//link[@rel='canonical']/@href");
    if (isbnNode == null)
    {
        appendLineToLog("[error] no canonical link found in " + filePath);
        return;
    }

    //first try the form with a query string; fall back to an ISBN terminated by '/' or '"'
    Match isbnMatch = Regex.Match(isbnNode.OuterHtml, @".+/product-reviews/(.+)\?");
    if (!isbnMatch.Success)
    {
        isbnMatch = Regex.Match(isbnNode.OuterHtml, @".+/product-reviews/(.+)(/|"")");
    }
    string isbn = isbnMatch.Groups[1].Value; //empty string when neither pattern matched

    using (AmazonCrawlerEntities context = DbUtil.getNewContext())
    {
        Book b = DbUtil.getBook(context, isbn);
        if (b == null) //this should never happen
        {
            appendLineToLog("[error] ISBN " + isbn + " not found in database");
            return;
        }

        int numProcessed = 0;
        int numNotProcessed = 0;
        int numBadges = 0;

        //a page without any review blocks is reported as 0 saved, as SelectNodes gives null in that case
        HtmlNodeCollection reviews = doc.SelectNodes(".//div[@class='a-section review']");
        if (reviews != null)
        {
            foreach (HtmlNode r in reviews)
            {
                //id
                string reviewId = r.Id;
                Review reviewToAdd = DbUtil.getOrCreateReview(context, reviewId);
                reviewToAdd.Book = b;

                //starRating
                HtmlNode ratingStringNode = r.SelectSingleNode(".//i[starts-with(@class, 'a-icon a-icon-star')]");
                short rating = CrawlUtil.GetStarRatingFromString(ratingStringNode.InnerText);
                reviewToAdd.starRating = rating;

                //reviewTitle
                HtmlNode titleNode = r.SelectSingleNode(".//a[contains(@class, 'review-title')]");
                reviewToAdd.reviewTitle = CrawlUtil.RemoveExcessWhitespaceFromString(titleNode.InnerText.Trim());

                //reviewerId and reviewerName and badges
                HtmlNode reviewerNode = r.SelectSingleNode(".//a[contains(@class, 'author')]");
                if (reviewerNode != null)
                {
                    string profileUrl = reviewerNode.GetAttributeValue("href", "");
                    string reviewerId = "";
                    Match profileMatch = Regex.Match(profileUrl, @"profile/(.+)/ref");
                    if (profileMatch.Success)
                    {
                        reviewerId = profileMatch.Groups[1].Value;
                    }
                    string reviewerName = CrawlUtil.RemoveExcessWhitespaceFromString(reviewerNode.InnerText);

                    Reviewer reviewer = DbUtil.getOrCreateReviewer(context, reviewerId);
                    reviewer.reviewerName = reviewerName;
                    reviewer.profileURL = profileUrl;

                    reviewToAdd.reviewerId = reviewerId;
                    reviewToAdd.reviewerName = reviewerName;

                    //badges
                    HtmlNodeCollection badgeNodes = r.SelectNodes(".//a[contains(@class, 'c7y-badge-link')]");
                    if (badgeNodes != null)
                    {
                        foreach (HtmlNode badgeNode in badgeNodes)
                        {
                            try
                            {
                                //badges are saved through their own short-lived context, separate from the review context.
                                //NOTE(review): one context per badge assumes a rejected insert would otherwise stay
                                //queued and be retried by later SaveChanges calls - confirm for this EF version.
                                using (AmazonCrawlerEntities badgeContext = DbUtil.getNewContext())
                                {
                                    ReviewerBadge newBadge = new ReviewerBadge();
                                    newBadge.reviewerId = reviewerId;
                                    newBadge.badge = CrawlUtil.RemoveExcessWhitespaceFromString(badgeNode.InnerText);
                                    newBadge.statsCollectedAt = currentDateTime;

                                    badgeContext.ReviewerBadges.AddObject(newBadge);
                                    badgeContext.SaveChanges();
                                }
                                numBadges += 1;
                            }
                            catch (Exception ex)
                            {
                                //still best effort (the review itself is saved regardless), but no longer silent
                                appendLineToLog("[warn] badge not saved: " + ex.Message);
                            }
                        }
                    }
                }

                //publishedDate
                HtmlNode reviewDateNode = r.SelectSingleNode(".//span[contains(@class, 'review-date')]");
                string dateText = CrawlUtil.RemoveExcessWhitespaceFromString(reviewDateNode.InnerText);
                dateText = dateText.Replace("on", "");
                reviewToAdd.publishedDate = DateTime.Parse(dateText);

                //reviewContent
                HtmlNode reviewTextNode = r.SelectSingleNode(".//span[contains(@class, 'review-text')]");
                reviewToAdd.reviewContent = CrawlUtil.RemoveExcessWhitespaceFromString(reviewTextNode.InnerText);

                //numHelpful and numTotal
                //as at 2016-03-02 the 'helpful' text format is "3 people found this helpful." (numTotal doesn't exist any more)
                HtmlNode helpfulNode = r.SelectSingleNode(".//span[contains(@class, 'review-votes')]");
                if (helpfulNode != null)
                {
                    if (helpfulNode.InnerText.Contains("of"))
                    {
                        //older "x of y" format
                        int[] helpful = CrawlUtil.GetNumbersFromHelpfulText(helpfulNode.InnerText);
                        reviewToAdd.numHelpful = helpful[0];
                        reviewToAdd.numTotal = helpful[1];
                    }
                    else
                    {
                        Match helpfulMatch = Regex.Match(helpfulNode.InnerText, @"(\d+) p.+", RegexOptions.IgnoreCase);
                        if (helpfulMatch.Success)
                        {
                            reviewToAdd.numHelpful = Int32.Parse(helpfulMatch.Groups[1].Value);
                        }
                        reviewToAdd.numTotal = null;
                    }
                }

                //numComments (0 when the node is missing or its text is not a number)
                HtmlNode numCommentsNode = r.SelectSingleNode(".//span[contains(@class, 'review-comment-total')]");
                int numComments = 0;
                if (numCommentsNode != null)
                {
                    Int32.TryParse(numCommentsNode.InnerText, out numComments);
                }
                reviewToAdd.numComments = numComments;

                //isAmazonVerifiedPurchase: the presence of the span is the flag
                HtmlNode verifiedPurchaseNode = r.SelectSingleNode(".//span[@class = 'a-size-mini a-color-state a-text-bold']");
                reviewToAdd.isAmazonVerifiedPurchase = verifiedPurchaseNode != null;

                //format
                HtmlNode formatNode = r.SelectSingleNode(".//div[contains(@class, 'review-data')]/a");
                if (formatNode != null)
                {
                    string formatText = CrawlUtil.RemoveExcessWhitespaceFromString(formatNode.InnerText).Replace("Format:", "").Trim();
                    reviewToAdd.format = formatText;
                }

                reviewToAdd.statsCollectedAt = currentDateTime;
                reviewToAdd.reviewType = "R";

                try
                {
                    context.SaveChanges();
                    numProcessed += 1;
                }
                catch (Exception ex)
                {
                    appendLineToLog(ex.Message);
                    appendLineToLog(ex.StackTrace);
                    numNotProcessed += 1;
                }
            }
        }

        appendLineToLog(string.Format("{0} reviews saved/updated on page; {1} not saved; {2} badges.", numProcessed, numNotProcessed, numBadges));
    }
}
/// <summary>
/// Pipeline step for a book's "review page". On the first page it queues the remaining review pages
/// for crawling; when <c>ReviewHtmlCrawler.GetReviews</c> is set it saves every review on the page
/// (together with its reviewer and any reviewer badges) against the book currently being crawled.
/// </summary>
/// <param name="crawler">Crawler that fetched the page; further review pages are added to it and its user agent is flagged as blocked on a robot check.</param>
/// <param name="propertyBag">Supplies the "HtmlDoc" entry, the page title and text, and the response URI.</param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    DateTime currentDateTime = DateTime.Now;

    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
    }

    //check if the correct page is loaded (sometimes Amazon redirects to a robot check)
    //ordinal, case-insensitive comparison: no culture dependence and no failure on a missing title
    if (string.Equals(propertyBag.Title, "robot check", StringComparison.OrdinalIgnoreCase))
    {
        //log at most the first 100 characters; the page text may be shorter than that
        string pageText = propertyBag.Text ?? string.Empty;
        string excerpt = pageText.Substring(0, Math.Min(100, pageText.Length));
        ReviewHtmlCrawler.form.appendLineToLog(propertyBag.Title + Environment.NewLine + excerpt);

        //block the user agent that's currently in use
        ReviewHtmlCrawler.UserAgentTracker[crawler.UserAgent] = true;

        //log the page for retry
        File.AppendAllText("REVIEW-to-retry.txt", propertyBag.ResponseUri.OriginalString + System.Environment.NewLine);

        //sleep between 30 secs and 1 minute
        TimeSpan duration = new TimeSpan(0, 0, RandomNumber.Next(30, 60));
        ReviewHtmlCrawler.form.appendLineToLog(string.Format("Sleeping for {0}:{1} mins and secs before trying next book.", duration.Minutes, duration.Seconds));
        System.Threading.Thread.Sleep(duration);
        return;
    }

    ReviewHtmlCrawler.form.appendLineToLog("Crawling a 'review page' for " + ReviewHtmlCrawler.currentBook.DisplayString);
    ReviewHtmlCrawler.form.appendLineToLog(propertyBag.ResponseUri.ToString());

    using (AmazonCrawlerEntities context = DbUtil.getNewContext())
    {
        Book b = DbUtil.getBook(context, ReviewHtmlCrawler.currentBook.ISBN);
        if (b == null) //this should never happen
        {
            ReviewHtmlCrawler.form.appendLineToLog("[error] ISBN " + ReviewHtmlCrawler.currentBook.ISBN + " not found in database");
            return;
        }

        //add all other pages of reviews if crawling reviews and on page 1
        //TODO: also get star rating distribution
        if (ReviewHtmlCrawler.GetReviews && isFirstPage(propertyBag))
        {
            ReviewHtmlCrawler.form.appendLineToLog("crawling first page");

            int numPages = CrawlUtil.GetReviewLastPageNumber(htmlDoc.DocumentNode);
            CrawlUtil.AddReviewPagesToCrawl(crawler, propertyBag.ResponseUri.OriginalString, numPages);

            ReviewHtmlCrawler.form.appendLineToLog("***************************");
            ReviewHtmlCrawler.form.appendLineToLog(string.Format("*** {0} pages to crawl ***", numPages));
            ReviewHtmlCrawler.form.appendLineToLog("***************************");

            if (ReviewHtmlCrawler.GetRatingStats)
            {
                //rating stats are currently not collected: as of 2015-12-11 the rating break-down is
                //expressed in percentages rather than numbers, so the star counts that used to be read
                //with CrawlUtil.getStarRating are no longer available.
            }
        }

        if (ReviewHtmlCrawler.GetReviews)
        {
            int numProcessed = 0;
            int numNotProcessed = 0;
            int numBadges = 0;

            //a page without any review blocks is reported as 0 saved, as SelectNodes gives null in that case
            HtmlNodeCollection reviews = htmlDoc.DocumentNode.SelectNodes(".//div[@class='a-section review']");
            if (reviews != null)
            {
                foreach (HtmlNode r in reviews)
                {
                    //id
                    string reviewId = r.Id;
                    Review reviewToAdd = DbUtil.getOrCreateReview(context, reviewId);
                    reviewToAdd.Book = b;

                    //starRating
                    HtmlNode ratingStringNode = r.SelectSingleNode(".//i[starts-with(@class, 'a-icon a-icon-star')]");
                    short rating = CrawlUtil.GetStarRatingFromString(ratingStringNode.InnerText);
                    reviewToAdd.starRating = rating;

                    //reviewTitle
                    HtmlNode titleNode = r.SelectSingleNode(".//a[contains(@class, 'review-title')]");
                    reviewToAdd.reviewTitle = CrawlUtil.RemoveExcessWhitespaceFromString(titleNode.InnerText.Trim());

                    //reviewerId and reviewerName and badges
                    HtmlNode reviewerNode = r.SelectSingleNode(".//a[contains(@class, 'author')]");
                    if (reviewerNode != null)
                    {
                        string profileUrl = reviewerNode.GetAttributeValue("href", "");
                        string reviewerId = "";
                        Match profileMatch = Regex.Match(profileUrl, @"profile/(.+)/ref");
                        if (profileMatch.Success)
                        {
                            reviewerId = profileMatch.Groups[1].Value;
                        }
                        string reviewerName = CrawlUtil.RemoveExcessWhitespaceFromString(reviewerNode.InnerText);

                        Reviewer reviewer = DbUtil.getOrCreateReviewer(context, reviewerId);
                        reviewer.reviewerName = reviewerName;
                        reviewer.profileURL = profileUrl;

                        reviewToAdd.reviewerId = reviewerId;
                        reviewToAdd.reviewerName = reviewerName;

                        //badges
                        HtmlNodeCollection badgeNodes = r.SelectNodes(".//a[contains(@class, 'c7y-badge-link')]");
                        if (badgeNodes != null)
                        {
                            foreach (HtmlNode badgeNode in badgeNodes)
                            {
                                try
                                {
                                    //badges are saved through their own short-lived context, separate from the review context.
                                    //NOTE(review): one context per badge assumes a rejected insert would otherwise stay
                                    //queued and be retried by later SaveChanges calls - confirm for this EF version.
                                    using (AmazonCrawlerEntities badgeContext = DbUtil.getNewContext())
                                    {
                                        ReviewerBadge newBadge = new ReviewerBadge();
                                        newBadge.reviewerId = reviewerId;
                                        newBadge.badge = CrawlUtil.RemoveExcessWhitespaceFromString(badgeNode.InnerText);
                                        newBadge.statsCollectedAt = currentDateTime;

                                        badgeContext.ReviewerBadges.AddObject(newBadge);
                                        badgeContext.SaveChanges();
                                    }
                                    numBadges += 1;
                                }
                                catch (Exception ex)
                                {
                                    //still best effort (the review itself is saved regardless), but no longer silent
                                    ReviewHtmlCrawler.form.appendLineToLog("[warn] badge not saved: " + ex.Message);
                                }
                            }
                        }
                    }

                    //publishedDate
                    HtmlNode reviewDateNode = r.SelectSingleNode(".//span[contains(@class, 'review-date')]");
                    string dateText = CrawlUtil.RemoveExcessWhitespaceFromString(reviewDateNode.InnerText);
                    dateText = dateText.Replace("on", "");
                    reviewToAdd.publishedDate = DateTime.Parse(dateText);

                    //reviewContent
                    HtmlNode reviewTextNode = r.SelectSingleNode(".//span[contains(@class, 'review-text')]");
                    reviewToAdd.reviewContent = CrawlUtil.RemoveExcessWhitespaceFromString(reviewTextNode.InnerText);

                    //numHelpful and numTotal
                    //as at 2016-03-02 the 'helpful' text format is "3 people found this helpful." (numTotal doesn't exist any more)
                    HtmlNode helpfulNode = r.SelectSingleNode(".//span[contains(@class, 'review-votes')]");
                    if (helpfulNode != null)
                    {
                        if (helpfulNode.InnerText.Contains("of"))
                        {
                            //older "x of y" format
                            int[] helpful = CrawlUtil.GetNumbersFromHelpfulText(helpfulNode.InnerText);
                            reviewToAdd.numHelpful = helpful[0];
                            reviewToAdd.numTotal = helpful[1];
                        }
                        else
                        {
                            Match helpfulMatch = Regex.Match(helpfulNode.InnerText, @"(\d+) p.+", RegexOptions.IgnoreCase);
                            if (helpfulMatch.Success)
                            {
                                reviewToAdd.numHelpful = Int32.Parse(helpfulMatch.Groups[1].Value);
                            }
                            reviewToAdd.numTotal = null;
                        }
                    }

                    //numComments (0 when the node is missing or its text is not a number)
                    HtmlNode numCommentsNode = r.SelectSingleNode(".//span[contains(@class, 'review-comment-total')]");
                    int numComments = 0;
                    if (numCommentsNode != null)
                    {
                        Int32.TryParse(numCommentsNode.InnerText, out numComments);
                    }
                    reviewToAdd.numComments = numComments;

                    //isAmazonVerifiedPurchase: the presence of the span is the flag
                    HtmlNode verifiedPurchaseNode = r.SelectSingleNode(".//span[@class = 'a-size-mini a-color-state a-text-bold']");
                    reviewToAdd.isAmazonVerifiedPurchase = verifiedPurchaseNode != null;

                    //format
                    HtmlNode formatNode = r.SelectSingleNode(".//div[contains(@class, 'review-data')]/a");
                    if (formatNode != null)
                    {
                        string formatText = CrawlUtil.RemoveExcessWhitespaceFromString(formatNode.InnerText).Replace("Format:", "").Trim();
                        reviewToAdd.format = formatText;
                    }

                    reviewToAdd.statsCollectedAt = currentDateTime;
                    reviewToAdd.reviewType = "R";

                    try
                    {
                        context.SaveChanges();
                        numProcessed += 1;
                    }
                    catch (Exception ex)
                    {
                        ReviewHtmlCrawler.form.appendLineToLog(ex.Message);
                        ReviewHtmlCrawler.form.appendLineToLog(ex.StackTrace);
                        numNotProcessed += 1;
                    }
                }
            }

            ReviewHtmlCrawler.form.appendLineToLog(string.Format("{0} reviews saved/updated on page; {1} not saved; {2} badges.", numProcessed, numNotProcessed, numBadges));
        }
    }
}
/// <summary>
/// Pipeline step for an author's "details (rankings) page". Stores one <c>Ranking</c> row per
/// ranking node found on the page for the author currently being crawled; when the page has no
/// ranking nodes, a single row without a rank string is stored to mark the author as processed.
/// </summary>
/// <param name="crawler">Crawler that fetched the page (not used by this step).</param>
/// <param name="propertyBag">Supplies the "HtmlDoc" entry and the response URI.</param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    DateTime currentDateTime = DateTime.Now;

    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
    }

    AuthorHtmlCrawler.form.appendLineToLog("Crawling 'details (rankings) page' for " + AuthorHtmlCrawler.currentAuthor.DisplayString
                                    + " [ " + propertyBag.ResponseUri.OriginalString + " ]");

    lock (this)
    {
        using (AmazonCrawlerEntities context = DbUtil.getNewContext())
        {
            Author a = DbUtil.getAuthor(context, AuthorHtmlCrawler.currentAuthor.id);
            if (a == null) //this should never happen
            {
                AuthorHtmlCrawler.form.appendLineToLog("[error] author id " + AuthorHtmlCrawler.currentAuthor.id + " not found in database");
                return;
            }

            //get rankings
            var rankingNodes = htmlDoc.DocumentNode.SelectNodes(".//div[@class='nodeRank']");
            if (rankingNodes != null)
            {
                foreach (var rankNode in rankingNodes)
                {
                    //each ranking is saved on its own so one failure does not stop the others being attempted
                    try
                    {
                        Ranking r = new Ranking();
                        r.authorId = a.id;
                        r.statsCollectedAt = currentDateTime;
                        r.rankString = CrawlUtil.TidyHtmlText(rankNode.InnerText);
                        AuthorHtmlCrawler.form.appendLineToLog(r.rankString);

                        context.AddToRankings(r);
                        context.SaveChanges();
                    }
                    catch (Exception ex)
                    {
                        AuthorHtmlCrawler.form.appendLineToLog("**ERROR** " + ex.Message);
                        if (ex.InnerException != null)
                        {
                            AuthorHtmlCrawler.form.appendLineToLog("\t** " + ex.InnerException.Message);
                        }
                    }
                }
            }
            else
            {
                try //add a ranking with null rankstring to mark this author has been processed
                {
                    Ranking r = new Ranking();
                    r.authorId = a.id;
                    context.AddToRankings(r);
                    context.SaveChanges();
                }
                catch (Exception ex)
                {
                    //still best effort, but report it the same way as a failed ranking above
                    AuthorHtmlCrawler.form.appendLineToLog("**ERROR** " + ex.Message);
                    if (ex.InnerException != null)
                    {
                        AuthorHtmlCrawler.form.appendLineToLog("\t** " + ex.InnerException.Message);
                    }
                }
            }

            AuthorHtmlCrawler.form.appendLineToLog("Updated rankings for " + AuthorHtmlCrawler.currentAuthor.DisplayString);
        }
    }
}
/// <summary>
/// Looks up the given ISBNs through the product API in batches of 10 and copies title, page count,
/// publisher, publication date and detail page URL onto the matching books in the database.
/// </summary>
/// <remarks>
/// Before each lookup the books of the batch are marked as 'processed' by writing "-" into the
/// selected empty fields. The markers are cleared again when the response carries neither items
/// nor error information. A failure to save the markers aborts the whole run.
/// </remarks>
/// <param name="ISBNs">ISBNs to look up; nothing happens for a null or empty array.</param>
private void updateBooksDetail(string[] ISBNs)
{
    if (ISBNs == null || ISBNs.Length < 1)
    {
        //appendLineToLog("**No books selected");
        return;
    }

    ItemLookupRequest request = new ItemLookupRequest();

    /***** batch in 10s *****/
    for (int i = 0; i < ISBNs.Length; i = i + 10)
    {
        string[] toProcess = ISBNs.Skip(i).Take(10).ToArray();

        //this context sets the 'processed' markers and, if needed, removes them again further down
        using (AmazonCrawlerEntities c = DbUtil.getNewContext())
        {
            //mark unprocessed books as 'processed' by setting dummy values (but retain existing non null values)
            foreach (var isbn in toProcess)
            {
                Book b = DbUtil.getBook(c, isbn);
                if (b == null)
                {
                    appendLineToLog("[error] ISBN " + isbn + " not found in database");
                    continue;
                }
                if (chkTitle.Checked && b.title == null)
                {
                    b.title = "-";
                }
                if (chkDetailUrl.Checked && b.detailPageURL == null)
                {
                    b.detailPageURL = "-";
                }
            }

            try
            {
                c.SaveChanges();
            }
            catch (Exception ex)
            {
                appendLineToLog("[error] preparing books from Db: " + ex.Message);
                appendLineToLog("\t" + ex.StackTrace);
                return;
            }

            request.ItemId = toProcess;
            request.IdType = ItemLookupRequestIdType.ISBN;
            //request.SearchIndex = "Books";
            request.ResponseGroup = new string[] { "ItemAttributes" };
            //request.ResponseGroup = new string[] { "Reviews", "Large", "SalesRank" };

            ItemLookup itemLookup = new ItemLookup();
            itemLookup.Request = new ItemLookupRequest[] { request };
            itemLookup.AssociateTag = "notag"; //this is a required param, so I just use a dummy value which seems to work

            // send the ItemLookup request
            ItemLookupResponse response = amazonClient.ItemLookup(itemLookup);

            //Items[0] is only touched once we know the response actually has a first entry
            bool hasItemSet = response != null && response.Items != null && response.Items.Count() > 0;

            if (hasItemSet && response.Items[0].Item != null)
            {
                using (AmazonCrawlerEntities context = DbUtil.getNewContext())
                {
                    // write out the results from the ItemLookup request
                    foreach (var item in response.Items[0].Item)
                    {
                        Book toUpdate = DbUtil.getBook(context, item.ASIN);
                        if (toUpdate == null)
                        {
                            appendLineToLog("[error] ISBN " + item.ASIN + " not found in database");
                            continue;
                        }

                        var attributes = item.ItemAttributes;
                        if (attributes != null)
                        {
                            if (attributes.Title != null)
                            {
                                toUpdate.title = attributes.Title;
                            }

                            //2012-10-31 21:51
                            //Not going to get sales rank from here. There are multiple ranks listed on the details page
                            //so will crawl that separately instead.
                            //The plan is to crawl it same time as getting rating stats AND author ranks so the stats are
                            //collected at roughly the same time.

                            //a missing or non-positive page count is stored as null
                            int pageCount;
                            int.TryParse(attributes.NumberOfPages, out pageCount);
                            if (pageCount > 0)
                            {
                                toUpdate.pages = pageCount;
                            }
                            else
                            {
                                toUpdate.pages = null;
                            }

                            toUpdate.publisher = attributes.Publisher;

                            DateTime publicationDate = DateTime.MinValue;
                            DateTime.TryParse(attributes.PublicationDate, out publicationDate);
                            if (publicationDate.Equals(DateTime.MinValue))
                            {
                                //date format is just a year number.
                                DateTime.TryParse(attributes.PublicationDate + "/01/01", out publicationDate);
                            }
                            if (publicationDate > DateTime.MinValue)
                            {
                                toUpdate.publicationDate = publicationDate;
                            }
                            else
                            {
                                toUpdate.publicationDate = null;
                            }
                        }

                        //keep the URL up to the encoded '?' ("%3F"); use it unchanged when there is none
                        string detailUrl = item.DetailPageURL;
                        if (detailUrl != null)
                        {
                            int queryStart = detailUrl.IndexOf("%3F", StringComparison.Ordinal);
                            toUpdate.detailPageURL = queryStart >= 0 ? detailUrl.Substring(0, queryStart) : detailUrl;
                        }

                        context.SaveChanges();
                        appendLineToLog((attributes != null ? attributes.Title : null) + " (" + item.ASIN + ") updated.");
                    }
                }

                if (response.Items[0].Item.Count() != toProcess.Length)
                {
                    appendLineToLog((toProcess.Length - response.Items[0].Item.Count()) + " books skipped");
                }
            }
            else
            {
                appendLineToLog(toProcess.Length + " books skipped.");

                /********************
                 * Check if it's due to ItemID invalid error, if so then continue as normal
                 * ItemID invalid error just means the ISBN doesn't exist in Amazon's API.
                 * ******************/
                if (hasItemSet && response.Items[0].Request != null && response.Items[0].Request.Errors != null)
                {
                    var errors = response.Items[0].Request.Errors;
                    if (errors.Count() > 0 && errors[0].Code == "AWS.InvalidParameterValue")
                    {
                        sleep();
                        continue;
                    }
                    //any other reported error: the markers stay in place and the loop moves on after the usual delay
                }
                //Otherwise (no error information at all) there may be an API error
                //undo the 'process' marker.
                else
                {
                    foreach (var isbn in toProcess)
                    {
                        Book b = DbUtil.getBook(c, isbn);
                        if (b == null)
                        {
                            continue; //already reported while setting the markers
                        }
                        if (b.title == "-")
                        {
                            b.title = null;
                        }
                        if (b.detailPageURL == "-")
                        {
                            b.detailPageURL = null;
                        }
                    }

                    try
                    {
                        c.SaveChanges();
                    }
                    catch (Exception ex)
                    {
                        appendLineToLog("[error] preparing books from Db: " + ex.Message);
                        appendLineToLog("\t" + ex.StackTrace);
                        return;
                    }

                    //sleep between 3 and 13 minutes before continuing (3 minutes plus a random 0-10 minutes)
                    TimeSpan duration = new TimeSpan(0, 0, ((3 * 60) + (RANDOM.Next(10 * 60))));
                    appendLineToLog(string.Format("Sleeping for {0}:{1} mins and secs", duration.Minutes, duration.Seconds));
                    System.Threading.Thread.Sleep(duration);
                }
            }
        }

        sleep(); //delay each API call
    }
}