コード例 #1
0
        /// <summary>
        /// Pipeline step for a book's 'details page': reads the average rating and the
        /// review page URL from the parsed HTML and saves them on the current book
        /// (<c>BookHtmlCrawler.currentBook</c>).
        /// </summary>
        /// <param name="crawler">The crawler that fetched the page; its user agent is flagged when a robot check is served.</param>
        /// <param name="propertyBag">Data for the fetched page (parsed "HtmlDoc", title, text, response URI).</param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            //using c.MaximumCrawlDepth = 1 means only the details page will be processed (no other links are crawled as they're depth 2)

            HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;

            if (htmlDoc == null)
            {
                return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
            }

            BookHtmlCrawler.form.appendLineToLog("Crawling 'details page' for " + BookHtmlCrawler.currentBook.DisplayString);
            BookHtmlCrawler.form.appendLineToLog(propertyBag.ResponseUri.OriginalString);

            //NOTE(review): lock (this) is kept as-is because replacing it needs a dedicated lock field declared outside this method.
            lock (this)
            {
                //check if the correct page is loaded (sometimes Amazon redirects to a robot check).
                //Ordinal, null-safe comparison: a missing title is simply "not a robot check".
                if (string.Equals(propertyBag.Title, "robot check", StringComparison.OrdinalIgnoreCase))
                {
                    //log at most the first 100 characters; the page text may be shorter than that (or missing)
                    string pageText = propertyBag.Text ?? "";
                    string preview  = pageText.Substring(0, Math.Min(100, pageText.Length));
                    BookHtmlCrawler.form.appendLineToLog(propertyBag.Title + Environment.NewLine + preview);

                    //block the user agent that's currently in use
                    BookHtmlCrawler.UserAgentTracker[crawler.UserAgent] = true;

                    //sleep between 30 secs and 3 minutes (the lock is held while sleeping)
                    TimeSpan duration = new TimeSpan(0, 0, RandomNumber.Next(30, 3 * 60));

                    BookHtmlCrawler.form.appendLineToLog(string.Format("Sleeping for {0}:{1} mins and secs before trying next book.", duration.Minutes, duration.Seconds));
                    System.Threading.Thread.Sleep(duration);
                    return;
                }

                string updated = "";
                using (AmazonCrawlerEntities context = DbUtil.getNewContext())
                {
                    Book b = DbUtil.getBook(context, BookHtmlCrawler.currentBook.ISBN);

                    if (b == null) //this should never happen
                    {
                        BookHtmlCrawler.form.appendLineToLog("[error] ISBN " + BookHtmlCrawler.currentBook.ISBN + " not found in database");
                        return;
                    }

                    if (BookHtmlCrawler.GetDetailsAndAuthor)
                    {
                        //author, language, reading level, format, sales ranks not stored.
                        #region average rating

                        var averageRatingNode = htmlDoc.DocumentNode.SelectSingleNode(@"//div[@id='avgRating']/span/a");
                        if (averageRatingNode != null)
                        {
                            string  ratingText = averageRatingNode.InnerText;
                            decimal rating     = CrawlUtil.ExtractRatingFromSummaryString(CrawlUtil.TidyHtmlText(ratingText));
                            b.avgCustomerReview = rating;
                            BookHtmlCrawler.form.appendLineToLog("\tavg: " + rating);
                        }
                        #endregion

                        #region review page URL
                        string url           = null;
                        var    reviewUrlNode = htmlDoc.DocumentNode.SelectSingleNode(@"//a[@class='a-link-emphasis a-nowrap']/@href");
                        if (reviewUrlNode != null)
                        {
                            var working_url = reviewUrlNode.GetAttributeValue("href", null);

                            if (!string.IsNullOrEmpty(working_url))
                            {
                                //keep only the part of the link before the "/ref=" suffix
                                int refIndex = working_url.IndexOf("/ref=", StringComparison.Ordinal);
                                if (refIndex > 0)
                                {
                                    url = working_url.Substring(0, refIndex);
                                }
                            }
                        }

                        try //save for rating and reviewURL
                        {
                            BookHtmlCrawler.form.appendLineToLog("\treview URL added: " + url);
                            b.reviewPageURL = url ?? "-"; //"-" is stored when no review URL was found

                            context.SaveChanges();
                        }
                        catch (Exception ex)
                        {
                            //a failed save is logged and otherwise ignored :(
                            BookHtmlCrawler.form.appendLineToLog(ex.Message);
                        }
                        #endregion

                        //2015-12-01: the per-star rating stats feature is disabled.
                        //It looks like the ratings are now in %s rather than numbers.

                        updated += "DETAILS (avg rating, reviewURL)";
                    }
                }

                BookHtmlCrawler.form.appendLineToLog("Updated " + updated + " for " + BookHtmlCrawler.currentBook.DisplayString);
            }
        }
コード例 #2
0
        /// <summary>
        /// Loads a saved review page from disk, works out the book's ISBN from the page's
        /// canonical link, and saves/updates every review on the page together with its
        /// reviewer and reviewer badges. Progress and errors are written to the log.
        /// </summary>
        /// <param name="filePath">Path of the HTML file to load.</param>
        private void ProcessPage(string filePath)
        {
            DateTime currentDateTime = DateTime.Today;

            HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
            html.Load(filePath);

            HtmlNode doc = html.DocumentNode;

            appendLineToLog("Crawling a 'review page' for " + filePath);

            HtmlNode isbnNode = doc.SelectSingleNode(".//link[@rel='canonical']/@href");
            if (isbnNode == null)
            {
                //without the canonical link there is no way to tell which book the page belongs to
                appendLineToLog("[error] no canonical link found in " + filePath);
                return;
            }

            //<link rel="canonical" href="http://www.amazon.com/Breaking-Dawn-Twilight-Saga-Book/product-reviews/031606792X?pageNumber=113">
            Match isbnMatch = Regex.Match(isbnNode.OuterHtml, @".+/product-reviews/(.+)\?");
            if (!isbnMatch.Success)
            {
                //fall back to a link without a query string
                isbnMatch = Regex.Match(isbnNode.OuterHtml, @".+/product-reviews/(.+)(/|"")");
            }
            string isbn = isbnMatch.Groups[1].Value;

            using (AmazonCrawlerEntities context = DbUtil.getNewContext())
            {
                Book b = DbUtil.getBook(context, isbn);

                if (b == null) //this should never happen
                {
                    appendLineToLog("[error] ISBN " + isbn + " not found in database");
                    return;
                }

                HtmlNodeCollection reviews = doc.SelectNodes(".//div[@class='a-section review']");
                if (reviews == null)
                {
                    //SelectNodes yields null (not an empty collection) when nothing matches
                    appendLineToLog("No reviews found on page " + filePath);
                    return;
                }

                int numProcessed    = 0;
                int numNotProcessed = 0;
                int numBadges       = 0;

                foreach (HtmlNode r in reviews)
                {
                    //id
                    string reviewId = r.Id;

                    Review reviewToAdd = DbUtil.getOrCreateReview(context, reviewId);
                    reviewToAdd.Book = b;


                    //starRating
                    HtmlNode ratingStringNode = r.SelectSingleNode(".//i[starts-with(@class, 'a-icon a-icon-star')]");
                    short    rating           = CrawlUtil.GetStarRatingFromString(ratingStringNode.InnerText);
                    reviewToAdd.starRating = rating;


                    //reviewTitle
                    HtmlNode titleNode = r.SelectSingleNode(".//a[contains(@class, 'review-title')]");
                    string   title     = CrawlUtil.RemoveExcessWhitespaceFromString(titleNode.InnerText.Trim());
                    reviewToAdd.reviewTitle = title;


                    //reviewerId and reviewerName and badges
                    HtmlNode reviewerNode = r.SelectSingleNode(".//a[contains(@class, 'author')]");
                    if (reviewerNode != null)
                    {
                        string profileUrl = reviewerNode.GetAttributeValue("href", "");
                        string reviewerId = "";

                        Match m = Regex.Match(profileUrl, @"profile/(.+)/ref");
                        if (m.Success)
                        {
                            reviewerId = m.Groups[1].Value;
                        }

                        string reviewerName = CrawlUtil.RemoveExcessWhitespaceFromString(reviewerNode.InnerText);

                        Reviewer reviewer = DbUtil.getOrCreateReviewer(context, reviewerId);
                        reviewer.reviewerName = reviewerName;
                        reviewer.profileURL   = profileUrl;

                        reviewToAdd.reviewerId   = reviewerId;
                        reviewToAdd.reviewerName = reviewerName;

                        //badges
                        HtmlNodeCollection badgeNodes = r.SelectNodes(".//a[contains(@class, 'c7y-badge-link')]");
                        if (badgeNodes != null)
                        {
                            foreach (HtmlNode badgeNode in badgeNodes)
                            {
                                //each badge is saved through its own short-lived context so that a failed
                                //insert cannot linger as a pending change and break the badges after it
                                using (AmazonCrawlerEntities badgeContext = DbUtil.getNewContext())
                                {
                                    try
                                    {
                                        string badgeText = CrawlUtil.RemoveExcessWhitespaceFromString(badgeNode.InnerText);

                                        ReviewerBadge newBadge = new ReviewerBadge();
                                        newBadge.reviewerId       = reviewerId;
                                        newBadge.badge            = badgeText;
                                        newBadge.statsCollectedAt = currentDateTime;

                                        badgeContext.ReviewerBadges.AddObject(newBadge);
                                        badgeContext.SaveChanges();

                                        numBadges += 1;
                                    }
                                    catch (Exception)
                                    {
                                        //deliberately ignored: badge saving is best-effort
                                        //(presumably fails for badges that were already stored - TODO confirm)
                                    }
                                }
                            }
                        }
                    }

                    //publishedDate
                    HtmlNode reviewDateNode = r.SelectSingleNode(".//span[contains(@class, 'review-date')]");
                    string   dateText       = CrawlUtil.RemoveExcessWhitespaceFromString(reviewDateNode.InnerText);
                    dateText = dateText.Replace("on", "");
                    DateTime publishedDate = DateTime.Parse(dateText);
                    reviewToAdd.publishedDate = publishedDate;


                    //reviewContent
                    HtmlNode reviewTextNode = r.SelectSingleNode(".//span[contains(@class, 'review-text')]");
                    string   reviewContent  = CrawlUtil.RemoveExcessWhitespaceFromString(reviewTextNode.InnerText);
                    reviewToAdd.reviewContent = reviewContent;

                    //numHelpful and numTotal
                    //as at 2016-03-02 the 'helpful' text format is "3 people found this helpful." (numTotal doesn't exist any more)
                    HtmlNode helpfulNode = r.SelectSingleNode(".//span[contains(@class, 'review-votes')]");
                    if (helpfulNode != null)
                    {
                        if (helpfulNode.InnerText.Contains("of"))
                        {
                            //older "x of y" format
                            int[] helpful = CrawlUtil.GetNumbersFromHelpfulText(helpfulNode.InnerText);

                            reviewToAdd.numHelpful = helpful[0];
                            reviewToAdd.numTotal   = helpful[1];
                        }
                        else
                        {
                            Match m = Regex.Match(helpfulNode.InnerText, @"(\d+) p.+", RegexOptions.IgnoreCase);
                            if (m.Success)
                            {
                                reviewToAdd.numHelpful = Int32.Parse(m.Groups[1].Value);
                            }
                            reviewToAdd.numTotal = null;
                        }
                    }

                    //numComments (stays 0 when the node is missing or its text is not a number)
                    HtmlNode numCommentsNode = r.SelectSingleNode(".//span[contains(@class, 'review-comment-total')]");
                    int      numComments     = 0;
                    if (numCommentsNode != null)
                    {
                        Int32.TryParse(numCommentsNode.InnerText, out numComments);
                    }
                    reviewToAdd.numComments = numComments;


                    //isAmazonVerifiedPurchase: the mere presence of the marker span counts as verified
                    HtmlNode verifiedPurchaseNode = r.SelectSingleNode(".//span[@class = 'a-size-mini a-color-state a-text-bold']");
                    reviewToAdd.isAmazonVerifiedPurchase = verifiedPurchaseNode != null;

                    //format
                    HtmlNode formatNode = r.SelectSingleNode(".//div[contains(@class, 'review-data')]/a");
                    if (formatNode != null)
                    {
                        string formatText = CrawlUtil.RemoveExcessWhitespaceFromString(formatNode.InnerText).Replace("Format:", "").Trim();
                        reviewToAdd.format = formatText;
                    }

                    reviewToAdd.statsCollectedAt = currentDateTime;
                    reviewToAdd.reviewType       = "R";

                    try
                    {
                        context.SaveChanges();
                        numProcessed += 1;
                    }
                    catch (Exception ex)
                    {
                        appendLineToLog(ex.Message);
                        appendLineToLog(ex.StackTrace);
                        numNotProcessed += 1;
                    }
                }

                appendLineToLog(string.Format("{0} reviews saved/updated on page; {1} not saved; {2} badges.", numProcessed, numNotProcessed, numBadges));
            }
        }
コード例 #3
0
        /// <summary>
        /// Pipeline step for a book's 'review page'. On the first page it queues the remaining
        /// review pages for crawling; on every page it saves/updates each review together with
        /// its reviewer and reviewer badges for <c>ReviewHtmlCrawler.currentBook</c>.
        /// A robot-check page is logged to "REVIEW-to-retry.txt" and followed by a pause.
        /// </summary>
        /// <param name="crawler">The crawler that fetched the page; further review pages are added to it.</param>
        /// <param name="propertyBag">Data for the fetched page (parsed "HtmlDoc", title, text, response URI).</param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            DateTime currentDateTime = DateTime.Now;

            HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;

            if (htmlDoc == null)
            {
                return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
            }

            //check if the correct page is loaded (sometimes Amazon redirects to a robot check).
            //Ordinal, null-safe comparison: a missing title is simply "not a robot check".
            if (string.Equals(propertyBag.Title, "robot check", StringComparison.OrdinalIgnoreCase))
            {
                //log at most the first 100 characters; the page text may be shorter than that (or missing)
                string pageText = propertyBag.Text ?? "";
                string preview  = pageText.Substring(0, Math.Min(100, pageText.Length));
                ReviewHtmlCrawler.form.appendLineToLog(propertyBag.Title + Environment.NewLine + preview);

                //block the user agent that's currently in use
                ReviewHtmlCrawler.UserAgentTracker[crawler.UserAgent] = true;

                //log the page for retry
                File.AppendAllText("REVIEW-to-retry.txt", propertyBag.ResponseUri.OriginalString + System.Environment.NewLine);

                //sleep between 30 secs and 1 minute
                TimeSpan duration = new TimeSpan(0, 0, RandomNumber.Next(30, 60));

                ReviewHtmlCrawler.form.appendLineToLog(string.Format("Sleeping for {0}:{1} mins and secs before trying next book.", duration.Minutes, duration.Seconds));
                System.Threading.Thread.Sleep(duration);
                return;
            }

            ReviewHtmlCrawler.form.appendLineToLog("Crawling a 'review page' for " + ReviewHtmlCrawler.currentBook.DisplayString);
            ReviewHtmlCrawler.form.appendLineToLog(propertyBag.ResponseUri.ToString());

            using (AmazonCrawlerEntities context = DbUtil.getNewContext())
            {
                Book b = DbUtil.getBook(context, ReviewHtmlCrawler.currentBook.ISBN);

                if (b == null) //this should never happen
                {
                    ReviewHtmlCrawler.form.appendLineToLog("[error] ISBN " + ReviewHtmlCrawler.currentBook.ISBN + " not found in database");
                    return;
                }

                //add all other pages of reviews if crawling reviews and on page 1
                //TODO: also get star rating distribution
                if (ReviewHtmlCrawler.GetReviews && isFirstPage(propertyBag))
                {
                    ReviewHtmlCrawler.form.appendLineToLog("crawling first page");

                    int numPages = CrawlUtil.GetReviewLastPageNumber(htmlDoc.DocumentNode);
                    CrawlUtil.AddReviewPagesToCrawl(crawler, propertyBag.ResponseUri.OriginalString, numPages);

                    ReviewHtmlCrawler.form.appendLineToLog("***************************");
                    ReviewHtmlCrawler.form.appendLineToLog(string.Format("*** {0} pages to crawl ***", numPages));
                    ReviewHtmlCrawler.form.appendLineToLog("***************************");

                    if (ReviewHtmlCrawler.GetRatingStats)
                    {
                        //rating stats collection is disabled: as of 2015-12-11 the rating
                        //break-down is expressed in percentages rather than numbers.
                    }
                }

                if (ReviewHtmlCrawler.GetReviews)
                {
                    HtmlNodeCollection reviews = htmlDoc.DocumentNode.SelectNodes(".//div[@class='a-section review']");
                    if (reviews == null)
                    {
                        //SelectNodes yields null (not an empty collection) when nothing matches
                        ReviewHtmlCrawler.form.appendLineToLog("No reviews found on page " + propertyBag.ResponseUri.ToString());
                        return;
                    }

                    int numProcessed    = 0;
                    int numNotProcessed = 0;
                    int numBadges       = 0;

                    foreach (HtmlNode r in reviews)
                    {
                        //id
                        string reviewId = r.Id;

                        Review reviewToAdd = DbUtil.getOrCreateReview(context, reviewId);
                        reviewToAdd.Book = b;


                        //starRating
                        HtmlNode ratingStringNode = r.SelectSingleNode(".//i[starts-with(@class, 'a-icon a-icon-star')]");
                        short    rating           = CrawlUtil.GetStarRatingFromString(ratingStringNode.InnerText);
                        reviewToAdd.starRating = rating;


                        //reviewTitle
                        HtmlNode titleNode = r.SelectSingleNode(".//a[contains(@class, 'review-title')]");
                        string   title     = CrawlUtil.RemoveExcessWhitespaceFromString(titleNode.InnerText.Trim());
                        reviewToAdd.reviewTitle = title;


                        //reviewerId and reviewerName and badges
                        HtmlNode reviewerNode = r.SelectSingleNode(".//a[contains(@class, 'author')]");
                        if (reviewerNode != null)
                        {
                            string profileUrl = reviewerNode.GetAttributeValue("href", "");
                            string reviewerId = "";

                            Match m = Regex.Match(profileUrl, @"profile/(.+)/ref");
                            if (m.Success)
                            {
                                reviewerId = m.Groups[1].Value;
                            }

                            string reviewerName = CrawlUtil.RemoveExcessWhitespaceFromString(reviewerNode.InnerText);

                            Reviewer reviewer = DbUtil.getOrCreateReviewer(context, reviewerId);
                            reviewer.reviewerName = reviewerName;
                            reviewer.profileURL   = profileUrl;

                            reviewToAdd.reviewerId   = reviewerId;
                            reviewToAdd.reviewerName = reviewerName;

                            //badges
                            HtmlNodeCollection badgeNodes = r.SelectNodes(".//a[contains(@class, 'c7y-badge-link')]");
                            if (badgeNodes != null)
                            {
                                foreach (HtmlNode badgeNode in badgeNodes)
                                {
                                    //each badge is saved through its own short-lived context so that a failed
                                    //insert cannot linger as a pending change and break the badges after it
                                    using (AmazonCrawlerEntities badgeContext = DbUtil.getNewContext())
                                    {
                                        try
                                        {
                                            string badgeText = CrawlUtil.RemoveExcessWhitespaceFromString(badgeNode.InnerText);

                                            ReviewerBadge newBadge = new ReviewerBadge();
                                            newBadge.reviewerId       = reviewerId;
                                            newBadge.badge            = badgeText;
                                            newBadge.statsCollectedAt = currentDateTime;

                                            badgeContext.ReviewerBadges.AddObject(newBadge);
                                            badgeContext.SaveChanges();

                                            numBadges += 1;
                                        }
                                        catch (Exception)
                                        {
                                            //deliberately ignored: badge saving is best-effort
                                            //(presumably fails for badges that were already stored - TODO confirm)
                                        }
                                    }
                                }
                            }
                        }

                        //publishedDate
                        HtmlNode reviewDateNode = r.SelectSingleNode(".//span[contains(@class, 'review-date')]");
                        string   dateText       = CrawlUtil.RemoveExcessWhitespaceFromString(reviewDateNode.InnerText);
                        dateText = dateText.Replace("on", "");
                        DateTime publishedDate = DateTime.Parse(dateText);
                        reviewToAdd.publishedDate = publishedDate;


                        //reviewContent
                        HtmlNode reviewTextNode = r.SelectSingleNode(".//span[contains(@class, 'review-text')]");
                        string   reviewContent  = CrawlUtil.RemoveExcessWhitespaceFromString(reviewTextNode.InnerText);
                        reviewToAdd.reviewContent = reviewContent;

                        //numHelpful and numTotal
                        //as at 2016-03-02 the 'helpful' text format is "3 people found this helpful." (numTotal doesn't exist any more)
                        HtmlNode helpfulNode = r.SelectSingleNode(".//span[contains(@class, 'review-votes')]");
                        if (helpfulNode != null)
                        {
                            if (helpfulNode.InnerText.Contains("of"))
                            {
                                //older "x of y" format
                                int[] helpful = CrawlUtil.GetNumbersFromHelpfulText(helpfulNode.InnerText);

                                reviewToAdd.numHelpful = helpful[0];
                                reviewToAdd.numTotal   = helpful[1];
                            }
                            else
                            {
                                Match m = Regex.Match(helpfulNode.InnerText, @"(\d+) p.+", RegexOptions.IgnoreCase);
                                if (m.Success)
                                {
                                    reviewToAdd.numHelpful = Int32.Parse(m.Groups[1].Value);
                                }
                                reviewToAdd.numTotal = null;
                            }
                        }

                        //numComments (stays 0 when the node is missing or its text is not a number)
                        HtmlNode numCommentsNode = r.SelectSingleNode(".//span[contains(@class, 'review-comment-total')]");
                        int      numComments     = 0;
                        if (numCommentsNode != null)
                        {
                            Int32.TryParse(numCommentsNode.InnerText, out numComments);
                        }
                        reviewToAdd.numComments = numComments;


                        //isAmazonVerifiedPurchase: the mere presence of the marker span counts as verified
                        HtmlNode verifiedPurchaseNode = r.SelectSingleNode(".//span[@class = 'a-size-mini a-color-state a-text-bold']");
                        reviewToAdd.isAmazonVerifiedPurchase = verifiedPurchaseNode != null;

                        //format
                        HtmlNode formatNode = r.SelectSingleNode(".//div[contains(@class, 'review-data')]/a");
                        if (formatNode != null)
                        {
                            string formatText = CrawlUtil.RemoveExcessWhitespaceFromString(formatNode.InnerText).Replace("Format:", "").Trim();
                            reviewToAdd.format = formatText;
                        }

                        reviewToAdd.statsCollectedAt = currentDateTime;
                        reviewToAdd.reviewType       = "R";

                        try
                        {
                            context.SaveChanges();
                            numProcessed += 1;
                        }
                        catch (Exception ex)
                        {
                            ReviewHtmlCrawler.form.appendLineToLog(ex.Message);
                            ReviewHtmlCrawler.form.appendLineToLog(ex.StackTrace);
                            numNotProcessed += 1;
                        }
                    }

                    ReviewHtmlCrawler.form.appendLineToLog(string.Format("{0} reviews saved/updated on page; {1} not saved; {2} badges.", numProcessed, numNotProcessed, numBadges));
                }
            }
        }
コード例 #4
0
        /// <summary>
        /// Pipeline step for an author's 'details (rankings) page': stores one
        /// <c>Ranking</c> row per ranking node found on the page for
        /// <c>AuthorHtmlCrawler.currentAuthor</c>. When the page has no ranking nodes, a
        /// single row without a rank string is stored to mark the author as processed.
        /// </summary>
        /// <param name="crawler">The crawler that fetched the page (not used by this step).</param>
        /// <param name="propertyBag">Data for the fetched page (parsed "HtmlDoc", response URI).</param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            DateTime currentDateTime = DateTime.Now;

            HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;

            if (htmlDoc == null)
            {
                return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
            }

            AuthorHtmlCrawler.form.appendLineToLog("Crawling 'details (rankings) page' for " + AuthorHtmlCrawler.currentAuthor.DisplayString + " [ " + propertyBag.ResponseUri.OriginalString + " ]");

            //NOTE(review): lock (this) is kept as-is because replacing it needs a dedicated lock field declared outside this method.
            lock (this)
            {
                string updated = "rankings";

                //the context is disposed as soon as this page has been handled
                using (AmazonCrawlerEntities context = DbUtil.getNewContext())
                {
                    Author a = DbUtil.getAuthor(context, AuthorHtmlCrawler.currentAuthor.id);

                    if (a == null) //this should never happen
                    {
                        AuthorHtmlCrawler.form.appendLineToLog("[error] author id " + AuthorHtmlCrawler.currentAuthor.id + " not found in database");
                        return;
                    }

                    //get rankings
                    var rankingNodes = htmlDoc.DocumentNode.SelectNodes(".//div[@class='nodeRank']");
                    if (rankingNodes != null)
                    {
                        foreach (var rankNode in rankingNodes)
                        {
                            try
                            {
                                Ranking r = new Ranking();
                                r.authorId         = a.id;
                                r.statsCollectedAt = currentDateTime;
                                r.rankString       = CrawlUtil.TidyHtmlText(rankNode.InnerText);

                                AuthorHtmlCrawler.form.appendLineToLog(r.rankString);
                                context.AddToRankings(r);
                                context.SaveChanges();
                            }
                            catch (Exception ex)
                            {
                                AuthorHtmlCrawler.form.appendLineToLog("**ERROR** " + ex.Message);
                                if (ex.InnerException != null)
                                {
                                    AuthorHtmlCrawler.form.appendLineToLog("\t** " + ex.InnerException.Message);
                                }
                            }
                        }
                    }
                    else
                    {
                        try //add a ranking with null rankstring to mark this author has been processed
                        {
                            Ranking r = new Ranking();
                            r.authorId = a.id;

                            context.AddToRankings(r);
                            context.SaveChanges();
                        }
                        catch (Exception ex)
                        {
                            //still best-effort, but report the failure the same way the ranking loop does
                            AuthorHtmlCrawler.form.appendLineToLog("**ERROR** " + ex.Message);
                            if (ex.InnerException != null)
                            {
                                AuthorHtmlCrawler.form.appendLineToLog("\t** " + ex.InnerException.Message);
                            }
                        }
                    }
                }

                AuthorHtmlCrawler.form.appendLineToLog("Updated " + updated + " for " + AuthorHtmlCrawler.currentAuthor.DisplayString);
            }
        }
コード例 #5
0
ファイル: MainForm.cs プロジェクト: ron-t/breview-a-crawler
        /// <summary>
        /// Looks up the given ISBNs through the Amazon ItemLookup API, in batches of 10, and copies the
        /// returned title, page count, publisher, publication date and detail page URL onto the
        /// matching Book rows.
        /// </summary>
        /// <remarks>
        /// Before each API call the batch's books are marked as 'processed' by writing "-" into null
        /// title / detailPageURL fields (only for the fields whose checkbox is ticked). If the call
        /// yields no items and the reason is anything other than AWS.InvalidParameterValue, the
        /// markers are reset to null and the method backs off before moving to the next batch.
        /// </remarks>
        /// <param name="ISBNs">ISBNs of the books to update. Null or empty does nothing.</param>
        private void updateBooksDetail(string[] ISBNs)
        {
            if (ISBNs == null || ISBNs.Length < 1)
            {
                //appendLineToLog("**No books selected");
                return;
            }

            ItemLookupRequest request = new ItemLookupRequest();

            /***** batch in 10s *****/
            for (int i = 0; i < ISBNs.Length; i = i + 10)
            {
                string[] toProcess = ISBNs.Skip(i).Take(10).ToArray();

                //the marker context is disposed at the end of every batch (also on return/continue)
                using (AmazonCrawlerEntities c = DbUtil.getNewContext())
                {
                    //mark unprocessed books as 'processed' by setting dummy values (but retain existing non null values)
                    foreach (var isbn in toProcess)
                    {
                        Book b = DbUtil.getBook(c, isbn);

                        if (b == null)
                        {
                            //nothing to mark; the lookup result for this ISBN is reported as not found further down
                            appendLineToLog("[error] ISBN " + isbn + " not found in database");
                            continue;
                        }

                        if (chkTitle.Checked && b.title == null)
                        {
                            b.title = "-";
                        }

                        if (chkDetailUrl.Checked && b.detailPageURL == null)
                        {
                            b.detailPageURL = "-";
                        }
                    }

                    try
                    {
                        c.SaveChanges();
                    }
                    catch (Exception ex)
                    {
                        appendLineToLog("[error] preparing books from Db: " + ex.Message);
                        appendLineToLog("\t" + ex.StackTrace);
                        return;
                    }

                    request.ItemId = toProcess;
                    request.IdType = ItemLookupRequestIdType.ISBN;
                    //request.SearchIndex = "Books";

                    request.ResponseGroup = new string[] { "ItemAttributes" };
                    //request.ResponseGroup = new string[] { "Reviews", "Large", "SalesRank" };

                    ItemLookup itemLookup = new ItemLookup();
                    itemLookup.Request      = new ItemLookupRequest[] { request };
                    itemLookup.AssociateTag = "notag"; //this is a required param, so I just use a dummy value which seems to work

                    // send the ItemLookup request
                    ItemLookupResponse response = amazonClient.ItemLookup(itemLookup);

                    bool hasResultSet = response != null && response.Items != null && response.Items.Any();

                    if (hasResultSet && response.Items[0].Item != null)
                    {
                        using (AmazonCrawlerEntities context = DbUtil.getNewContext())
                        {
                            // write out the results from the ItemLookup request
                            foreach (var item in response.Items[0].Item)
                            {
                                Book toUpdate = DbUtil.getBook(context, item.ASIN);

                                if (toUpdate == null)
                                {
                                    appendLineToLog("[error] ISBN " + item.ASIN + " not found in database");
                                    continue;
                                }

                                var attributes = item.ItemAttributes;
                                string loggedTitle = null;

                                //an item can come back without attributes; only touch attribute-derived fields when present
                                if (attributes != null)
                                {
                                    loggedTitle = attributes.Title;

                                    if (attributes.Title != null)
                                    {
                                        toUpdate.title = attributes.Title;
                                    }

                                    //2012-10-31 21:51
                                    //Not going to get sales rank from here. There are multiple ranks listed on the details page
                                    //so will crawl that separately instead.
                                    //The plan is to crawl it same time as getting rating stats AND author ranks so the stats are
                                    //collected at roughly the same time.

                                    int parsedPages;
                                    int.TryParse(attributes.NumberOfPages, out parsedPages);
                                    if (parsedPages > 0)
                                    {
                                        toUpdate.pages = parsedPages;
                                    }
                                    else
                                    {
                                        toUpdate.pages = null;
                                    }

                                    toUpdate.publisher = attributes.Publisher;

                                    DateTime parsedDate;
                                    DateTime.TryParse(attributes.PublicationDate, out parsedDate);
                                    if (parsedDate.Equals(DateTime.MinValue))
                                    {
                                        //date format is just a year number.
                                        DateTime.TryParse(attributes.PublicationDate + "/01/01", out parsedDate);
                                    }
                                    if (parsedDate > DateTime.MinValue)
                                    {
                                        toUpdate.publicationDate = parsedDate;
                                    }
                                    else
                                    {
                                        toUpdate.publicationDate = null;
                                    }
                                }

                                if (item.DetailPageURL != null)
                                {
                                    //keep only the part before the URL-encoded '?' ("%3F");
                                    //if the marker is absent, keep the URL as it is instead of throwing
                                    int queryStart = item.DetailPageURL.IndexOf("%3F", StringComparison.Ordinal);
                                    toUpdate.detailPageURL = queryStart >= 0
                                        ? item.DetailPageURL.Substring(0, queryStart)
                                        : item.DetailPageURL;
                                }

                                context.SaveChanges();
                                appendLineToLog(loggedTitle + " (" + item.ASIN + ") updated.");
                            }
                        }

                        if (response.Items[0].Item.Count() != toProcess.Count())
                        {
                            appendLineToLog((toProcess.Count() - response.Items[0].Item.Count()) + " books skipped");
                        }
                    }
                    else
                    {
                        appendLineToLog(toProcess.Count() + " books skipped.");

                        /********************
                        * Check if it's due to ItemID invalid error, if so then continue as normal
                        * ItemID invalid error just means the ISBN doesn't exist in Amazon's API.
                        * ******************/
                        bool invalidItemId = false;
                        if (hasResultSet
                            && response.Items[0].Request != null
                            && response.Items[0].Request.Errors != null
                            && response.Items[0].Request.Errors.Any())
                        {
                            invalidItemId = response.Items[0].Request.Errors[0].Code == "AWS.InvalidParameterValue";
                        }

                        if (invalidItemId)
                        {
                            sleep();
                            continue;
                        }

                        //Otherwise there may be an API error (no error info at all, or any other error code)
                        //undo the 'process' marker.
                        foreach (var isbn in toProcess)
                        {
                            Book b = DbUtil.getBook(c, isbn);

                            if (b == null)
                            {
                                continue;
                            }

                            if (b.title == "-")
                            {
                                b.title = null;
                            }

                            if (b.detailPageURL == "-")
                            {
                                b.detailPageURL = null;
                            }
                        }

                        try
                        {
                            c.SaveChanges();
                        }
                        catch (Exception ex)
                        {
                            appendLineToLog("[error] preparing books from Db: " + ex.Message);
                            appendLineToLog("\t" + ex.StackTrace);
                            return;
                        }

                        //sleep between 3 and (just under) 13 minutes before continuing:
                        //a fixed 3 minutes plus a random 0..599 seconds
                        TimeSpan duration = new TimeSpan(0, 0, ((3 * 60) + (RANDOM.Next(10 * 60))));
                        appendLineToLog(string.Format("Sleeping for {0}:{1} mins and secs", duration.Minutes, duration.Seconds));
                        System.Threading.Thread.Sleep(duration);
                    }
                }

                sleep(); //delay each API call
            }
        }