示例#1
0
        /// <summary>
        /// Reads the author, body text, star rating, publish date and "Recommended for" text
        /// from a crawled review page and saves them onto the <c>Review</c> entity identified
        /// by <c>ReviewId</c>.
        /// </summary>
        /// <remarks>
        /// Returns without doing anything when the property bag holds no <c>HtmlDocument</c>
        /// under "HtmlDoc", when the response URI differs from <c>CrawlReviews.baseUri</c>,
        /// or when the review cannot be loaded. A field whose node is missing, or whose value
        /// cannot be parsed, is left unchanged on the entity.
        /// </remarks>
        /// <param name="crawler">Not used by this method.</param>
        /// <param name="propertyBag">Supplies the "HtmlDoc" property and the response URI of the fetched page.</param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;

            if (htmlDoc == null)
            {
                return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
            }

            //only process the review page
            if (propertyBag.ResponseUri.OriginalString != CrawlReviews.baseUri)
            {
                return;
            }

            // NOTE(review): locking on 'this' is kept as-is because other members (not visible here)
            // may synchronize on the same instance; a private gate object would be preferable.
            lock (this)
            {
                // NOTE(review): assumes GoodReadsCrawlerEntities is an Entity Framework context
                // (IDisposable) - 'using' releases it on every exit path, including the early return.
                using (GoodReadsCrawlerEntities context = CrawlUtil.getNewContext())
                {
                    Review review = CrawlUtil.getReview(context, ReviewId);

                    if (review == null)
                    {
                        return; //this should never happen
                    }

                    //userId	int	Checked
                    //userIdString	varchar(256)	Checked
                    //reviewContent	varchar(MAX)	Checked
                    //starRating	smallint	Checked
                    //publishDate	date	Checked
                    //recommendedFor	varchar(MAX)	Checked
                    //numComments	int	Checked

                    HtmlNode doc = htmlDoc.DocumentNode;

                    //<a href="/user/show/52663-mer" class="userReview" itemprop="author">Mer</a>
                    var reviewAuthorNode = doc.SelectSingleNode("//a[@itemprop='author']");
                    if (reviewAuthorNode != null)
                    {
                        string authorHref   = reviewAuthorNode.GetAttributeValue("href", null);
                        string userIdString = CrawlUtil.extractUserIdStringFromUrl(authorHref);   //null if error
                        int    userId       = CrawlUtil.extractUserIdFromString(userIdString);    //-1 or 0 if error
                        string userName     = reviewAuthorNode.InnerText.Trim();                  //empty if error

                        // Only link the review to a user when all three pieces were extracted.
                        if (userIdString != null && userId > 0 && !String.IsNullOrEmpty(userName))
                        {
                            var user = CrawlUtil.createOrGetUser(context, userId, userIdString, userName);
                            review.User         = user;
                            review.userIdString = userIdString;
                        }
                    }

                    var reviewContentNode = doc.SelectSingleNode("//div[@itemprop='reviewBody']");
                    if (reviewContentNode != null)
                    {
                        review.reviewContent = reviewContentNode.InnerText.Trim();
                    }

                    //<span class="value-title" title="5"></span>
                    var starRatingNode = doc.SelectSingleNode("//div[@class='rating']/span[@class='value-title']");
                    if (starRatingNode != null)
                    {
                        short starRating;

                        // TryParse writes 0 to the out value on failure, so only store the rating
                        // when parsing succeeded. The attribute is markup data, hence InvariantCulture.
                        if (short.TryParse(starRatingNode.GetAttributeValue("title", ""),
                                           System.Globalization.NumberStyles.Integer,
                                           System.Globalization.CultureInfo.InvariantCulture,
                                           out starRating))
                        {
                            review.starRating = starRating;
                        }
                    }

                    //<span class="value-title" title="2007-04-28"></span>
                    var publishDateNode = doc.SelectSingleNode("//div[@class='right dtreviewed greyText smallText']/span[@class='value-title']");
                    if (publishDateNode != null)
                    {
                        DateTime publishDate;

                        // TryParse yields DateTime.MinValue on failure; do not persist that as a date.
                        if (DateTime.TryParse(publishDateNode.GetAttributeValue("title", ""),
                                              System.Globalization.CultureInfo.InvariantCulture,
                                              System.Globalization.DateTimeStyles.None,
                                              out publishDate))
                        {
                            review.publishDate = publishDate;
                        }
                    }

                    /*
                     * <div>
                     *  <span class="">Recommended for:</span>
                     *     enviornmentalists, nurturers, parents and children who want to discuss empathy and reciprocity
                     * </div>
                     */
                    var recomendedForNode = doc.SelectSingleNode("//span[text()='Recommended for:']");
                    if (recomendedForNode != null)
                    {
                        // The recommendation text is the last child of the label span's parent.
                        review.recommendedFor = recomendedForNode.ParentNode.LastChild.InnerText.Trim();
                    }

                    // TODO: numComments is not extracted yet. The count appears in the comments header:
                    //
                    //  <h2 class="brownBackground">
                    //    <div class="extraHeader">
                    //      <a href="#comment_form" rel="nofollow">Post a comment &raquo;</a>
                    //    </div>Comments
                    //    <span class="smallText">
                    //      (showing
                    //      1-5
                    //      of
                    //      5)
                    //    </span>
                    //    ...
                    //  </h2>

                    context.SaveChanges();

                    // review.User is only assigned above when the author was extracted successfully,
                    // so it may be null here; avoid a NullReferenceException after the save.
                    string authorName = review.User != null ? review.User.name : "(unknown)";
                    CrawlReviews.form.appendLineToLog("Added review " + review.id + " by user " + authorName);
                }
            }
        }