コード例 #1
0
        /*
         * [numFiveStarRatings][numFourStarRatings][numThreeStarRatings][numTwoStarRatings][numOneStarRatings]
         * <tr>
         *  <td align="left" style="padding-right:0.5em;padding-bottom:1px;white-space:nowrap;font-size:10px;">
         *      <a href="http://www.amazon.com/Already-Dead-Joe-Pitt-Novel/product-reviews/034547824X/ref=cm_cr_pr_hist_5?ie=UTF8&filterBy=addFiveStar&showViewpoints=0" style="font-family:Verdana,Arial,Helvetica,Sans-serif;">
         *          5 star
         *      </a>:
         *  </td>
         *  <td style="min-width:60; background-color: #eeeecc" width="60" align="left" class="tiny" title="49%">
         *      <...>
         *  </td>
         *  <td align="right" style="font-family:Verdana,Arial,Helvetica,Sans-serif;;font-size:10px;">
         *      &nbsp;(82)
         *  </td>
         * </tr>
         */
        /// <summary>
        /// Extracts the rating count for one star level from the rating histogram of an Amazon page.
        /// Three extractors are tried in order; an extractor returning -1 is treated as
        /// "could not parse" and the next one is tried.
        /// </summary>
        /// <param name="htmlDoc">Parsed HTML of the page to inspect.</param>
        /// <param name="b">Book being crawled. Not used by this method; kept for caller compatibility.</param>
        /// <param name="starNum">
        /// Star level as it appears in the histogram link's query string, i.e. the part between
        /// <c>filterBy=add</c> and <c>Star</c> (for example "Five").
        /// </param>
        /// <returns>
        /// The value produced by the first extractor that does not return -1. Returns 0 when the
        /// document contains no matching histogram link. If an extractor throws, the value assigned
        /// so far (0 or -1) is returned.
        /// </returns>
        internal static int GetStarRating(HtmlDocument htmlDoc, Book b, string starNum)
        {
            // Anchor whose href filters the reviews by this star level.
            string anchorStep = "a[contains(@href,'filterBy=add" + starNum + "Star')]";
            int rating = 0;

            // Grandparent of the filter link (the <tr> of the histogram row in review-page markup).
            var nStarNode = htmlDoc.DocumentNode.SelectSingleNode("//" + anchorStep + "/../..");
            if (nStarNode == null)
            {
                return rating;
            }

            try
            {
                //this method works for a review page
                rating = CrawlUtil.ExtractRatingFromStarString(CrawlUtil.TidyHtmlText(nStarNode.InnerText));

                //try another method (for detail page)
                if (rating == -1)
                {
                    // ".//" keeps the search inside the row; a leading "//" would restart at the
                    // document root even though the query is issued on nStarNode.
                    var n = nStarNode.SelectSingleNode(".//" + anchorStep);
                    if (n != null)
                    {
                        rating = CrawlUtil.ExtractRatingFromStarStringFromDetailPage(CrawlUtil.TidyHtmlText(n.InnerText));
                    }
                }

                //if it failed, try another method (for detail page)
                if (rating == -1)
                {
                    rating = CrawlUtil.ExtractRatingFromStarStringFromDetailPage(CrawlUtil.TidyHtmlText(nStarNode.InnerText));
                }
            }
            catch (Exception ex)
            {
                // Best effort: a parsing failure must not abort the crawl, but it should leave a trace.
                System.Diagnostics.Trace.TraceWarning("[" + starNum + " star rating] not found (" + ex.Message + ")");
            }

            return rating;
        }
コード例 #2
0
        /// <summary>
        /// Pipeline step for the Amazon details page of <c>BookHtmlCrawler.currentBook</c>.
        /// When the page is a "robot check" page, the crawler's user agent is flagged in
        /// <c>BookHtmlCrawler.UserAgentTracker</c>, the thread sleeps for a random duration and the
        /// book is skipped. Otherwise, when <c>BookHtmlCrawler.GetDetailsAndAuthor</c> is set, the
        /// average customer rating and the review page URL are read from the page and saved.
        /// </summary>
        /// <param name="crawler">Crawler that fetched the page; its <c>UserAgent</c> is flagged on a robot check.</param>
        /// <param name="propertyBag">
        /// Crawl result. The parsed document is read from its "HtmlDoc" entry; nothing is done
        /// when that entry is not an <c>HtmlDocument</c>.
        /// </param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            //using c.MaximumCrawlDepth = 1 means only the details page will be processed (no other links are crawled as they're depth 2)

            HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;

            if (htmlDoc == null)
            {
                return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
            }

            BookHtmlCrawler.form.appendLineToLog("Crawling 'details page' for " + BookHtmlCrawler.currentBook.DisplayString);
            BookHtmlCrawler.form.appendLineToLog(propertyBag.ResponseUri.OriginalString);

            lock (this)
            {
                //check if the correct page is loaded (sometimes Amazon redirects to a robot check)
                //null-safe, culture-independent comparison: a missing title simply means "not a robot check"
                if (string.Equals(propertyBag.Title, "robot check", StringComparison.OrdinalIgnoreCase))
                {
                    //log at most the first 100 characters; the page text may be shorter than that (or missing)
                    string pageText = propertyBag.Text ?? string.Empty;
                    string preview  = pageText.Substring(0, Math.Min(100, pageText.Length));
                    BookHtmlCrawler.form.appendLineToLog(propertyBag.Title + Environment.NewLine + preview);

                    //block the user agent that's currently in use
                    BookHtmlCrawler.UserAgentTracker[crawler.UserAgent] = true;

                    //sleep between 30 secs and 3 minutes
                    TimeSpan duration = new TimeSpan(0, 0, RandomNumber.Next(30, 3 * 60));

                    //seconds are zero-padded so that e.g. 65 seconds reads "1:05" rather than "1:5"
                    BookHtmlCrawler.form.appendLineToLog(string.Format("Sleeping for {0}:{1:00} mins and secs before trying next book.", duration.Minutes, duration.Seconds));
                    System.Threading.Thread.Sleep(duration);
                    return;
                }

                string updated = "";
                using (AmazonCrawlerEntities context = DbUtil.getNewContext())
                {
                    Book b = DbUtil.getBook(context, BookHtmlCrawler.currentBook.ISBN);

                    if (b == null) //this should never happen
                    {
                        BookHtmlCrawler.form.appendLineToLog("[error] ISBN " + BookHtmlCrawler.currentBook.ISBN + " not found in database");
                        return;
                    }

                    if (BookHtmlCrawler.GetDetailsAndAuthor)
                    {
                        //author, language, reading level, format, sales ranks not stored.
                        #region average rating

                        var averageRatingNode = htmlDoc.DocumentNode.SelectSingleNode(@"//div[@id='avgRating']/span/a");
                        if (averageRatingNode != null)
                        {
                            string  ratingText = averageRatingNode.InnerText;
                            decimal rating     = CrawlUtil.ExtractRatingFromSummaryString(CrawlUtil.TidyHtmlText(ratingText));
                            b.avgCustomerReview = rating;
                            BookHtmlCrawler.form.appendLineToLog("\tavg: " + rating);
                        }
                        #endregion

                        #region review page URL
                        string url           = null;
                        var    reviewUrlNode = htmlDoc.DocumentNode.SelectSingleNode(@"//a[@class='a-link-emphasis a-nowrap']/@href");
                        if (reviewUrlNode != null)
                        {
                            string workingUrl = reviewUrlNode.GetAttributeValue("href", null);

                            //keep only the part of the link before "/ref="; URLs are compared ordinally
                            int refIndex = string.IsNullOrEmpty(workingUrl)
                                ? -1
                                : workingUrl.IndexOf("/ref=", StringComparison.Ordinal);

                            if (refIndex > 0)
                            {
                                url = workingUrl.Substring(0, refIndex);
                            }
                        }

                        try //save for rating and reviewURL
                        {
                            BookHtmlCrawler.form.appendLineToLog("\treview URL added: " + url);
                            b.reviewPageURL = url ?? "-"; //"-" is stored when no review URL was found

                            context.SaveChanges();
                        }
                        catch (Exception ex)
                        {
                            //ignore :(
                            BookHtmlCrawler.form.appendLineToLog(ex.Message);
                        }
                        #endregion

                        //2015-12-01: disabled this feature. It looks like the ratings are now in %s rather than numbers.
                        #region rating stats
                        //TEMP 2014-06-12: get star rating from details page
                        //b.statsCollectedAt = DateTime.Now;

                        //b.numFiveStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.FIVE);
                        //b.numFourStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.FOUR);
                        //b.numThreeStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.THREE);
                        //b.numTwoStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.TWO);
                        //b.numOneStarRatings = CrawlUtil.getStarRating(htmlDoc, b, CrawlUtil.ONE);

                        //BookHtmlCrawler.form.appendLineToLog("\t" + CrawlUtil.FIVE + " star: " + b.numFiveStarRatings);
                        //BookHtmlCrawler.form.appendLineToLog("\t" + CrawlUtil.FOUR + " star: " + b.numFourStarRatings);
                        //BookHtmlCrawler.form.appendLineToLog("\t" + CrawlUtil.THREE + " star: " + b.numThreeStarRatings);
                        //BookHtmlCrawler.form.appendLineToLog("\t" + CrawlUtil.TWO + " star: " + b.numTwoStarRatings);
                        //BookHtmlCrawler.form.appendLineToLog("\t" + CrawlUtil.ONE + " star: " + b.numOneStarRatings);

                        //try //save for rating stats
                        //{
                        //    context.SaveChanges();
                        //    updated += "RATING STATS;";
                        //}
                        //catch (Exception ex)
                        //{
                        //    //ignore :(
                        //    ReviewHtmlCrawler.form.appendLineToLog(ex.Message);
                        //}
                        #endregion

                        updated += "DETAILS (avg rating, reviewURL)";
                    }
                }

                BookHtmlCrawler.form.appendLineToLog("Updated " + updated + " for " + BookHtmlCrawler.currentBook.DisplayString);
            }
        }
コード例 #3
0
        /// <summary>
        /// Pipeline step for the details (rankings) page of <c>AuthorHtmlCrawler.currentAuthor</c>.
        /// Each <c>div</c> with class <c>nodeRank</c> is stored as a <c>Ranking</c> row for the author.
        /// When the page has no such node, a single <c>Ranking</c> without a rank string is stored
        /// to mark the author as processed.
        /// </summary>
        /// <param name="crawler">Crawler that fetched the page; not used by this step.</param>
        /// <param name="propertyBag">
        /// Crawl result. The parsed document is read from its "HtmlDoc" entry; nothing is done
        /// when that entry is not an <c>HtmlDocument</c>.
        /// </param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            DateTime currentDateTime = DateTime.Now;

            HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;

            if (htmlDoc == null)
            {
                return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
            }

            AuthorHtmlCrawler.form.appendLineToLog("Crawling 'details (rankings) page' for " + AuthorHtmlCrawler.currentAuthor.DisplayString + " [ " + propertyBag.ResponseUri.OriginalString + " ]");

            lock (this)
            {
                //dispose the context on every exit path, including the early return below
                using (AmazonCrawlerEntities context = DbUtil.getNewContext())
                {
                    Author a       = DbUtil.getAuthor(context, AuthorHtmlCrawler.currentAuthor.id);
                    string updated = "rankings";

                    if (a == null) //this should never happen
                    {
                        AuthorHtmlCrawler.form.appendLineToLog("[error] author id " + AuthorHtmlCrawler.currentAuthor.id + " not found in database");
                        return;
                    }

                    //get rankings
                    var rankingNodes = htmlDoc.DocumentNode.SelectNodes(".//div[@class='nodeRank']");
                    if (rankingNodes != null)
                    {
                        foreach (var rankNode in rankingNodes)
                        {
                            try
                            {
                                Ranking r = new Ranking();
                                r.authorId         = a.id;
                                r.statsCollectedAt = currentDateTime;
                                r.rankString       = CrawlUtil.TidyHtmlText(rankNode.InnerText);

                                AuthorHtmlCrawler.form.appendLineToLog(r.rankString);
                                context.AddToRankings(r);
                                context.SaveChanges();
                            }
                            catch (Exception ex)
                            {
                                //one bad ranking must not stop the remaining ones
                                LogRankingError(ex);
                            }
                        }
                    }
                    else
                    {
                        try //add a ranking with null rankstring to mark this author has been processed
                        {
                            Ranking r = new Ranking();
                            r.authorId = a.id;

                            context.AddToRankings(r);
                            context.SaveChanges();
                        }
                        catch (Exception ex)
                        {
                            //still best effort, but no longer silent
                            LogRankingError(ex);
                        }
                    }

                    AuthorHtmlCrawler.form.appendLineToLog("Updated " + updated + " for " + AuthorHtmlCrawler.currentAuthor.DisplayString);
                }
            }
        }

        /// <summary>
        /// Writes an exception message, and its inner exception message when present, to the crawler log.
        /// </summary>
        private static void LogRankingError(Exception ex)
        {
            AuthorHtmlCrawler.form.appendLineToLog("**ERROR** " + ex.Message);
            if (ex.InnerException != null)
            {
                AuthorHtmlCrawler.form.appendLineToLog("\t** " + ex.InnerException.Message);
            }
        }