        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
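            //process one page of the user's 'read' shelf: on page 1, queue the remaining pages, then store each review row (and its book/author) that isn't already in the database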
            HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;

            if (htmlDoc == null)
            {
                return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
            }

            HtmlNode doc = htmlDoc.DocumentNode;

            //on page 1, add other pages for user
            if (propertyBag.ResponseUri.OriginalString == CrawlReviewsOnUserProfile.baseUri)
            {
                var node = doc.SelectSingleNode(".//div[@id='reviewPagination']");

                if (node != null)
                {
                    try
                    {
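                        //the highest page number presumably sits two siblings before the pagination div's last child (just before the 'next' link)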
                        var maxPageNode = node.LastChild.PreviousSibling.PreviousSibling;
                        int maxPage     = Int32.Parse(maxPageNode.InnerText.Trim());

                        string uri;
                        for (int i = 2; i <= maxPage; i++)
                        {
                            uri = "http://www.goodreads.com/review/list/" + User.userIdString + "?format=html&page=" + i + "&per_page=100&shelf=read";
                            crawler.AddStep(new Uri(uri), 0);

                            CrawlReviewsOnUserProfile.form.appendLineToLog(uri);
                        }
                    }
                    catch (Exception)
                    {
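                        //pagination parse errors are swallowed; only page 1 is crawled in that case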
                    }
                }
            }

            lock (this)
            {
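                //serialize database writes: Process may be invoked concurrently by the crawler's worker threads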
                GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();

                foreach (var reviewNode in doc.SelectNodes(".//tr[@class='bookalike review']"))
                {
                    CrawlReviewsOnUserProfile.count++;

                    string reviewIdString = reviewNode.GetAttributeValue("id", "");

                    if (reviewIdString == "")
                    {
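                        //a row without an id attribute likely means the page layout changed; give up on the rest of this page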
                        return;
                    }

                    int reviewId = Int32.Parse(reviewIdString.Split('_')[1]);

                    Review review = CrawlUtil.getReview(context, reviewId);

                    //create and process the REVIEW if it doesn't already exist
                    if (review == null)
                    {
                        HtmlNode node;
                        review = new Review();

                        review.id = reviewId;

                        //REVIEW.rating
                        node = reviewNode.SelectSingleNode(".//td[@class='field rating']//img");
                        if (node != null)
                        {
                            string ratingString = node.GetAttributeValue("alt", "0");
                            short  rating       = short.Parse(ratingString.Substring(0, 1));

                            review.starRating = rating;
                        }

                        //REVIEW.publishdate
                        node = reviewNode.SelectSingleNode(".//td[@class='field date_added']//span");
                        if (node != null)
                        {
                            DateTime date;
                            DateTime.TryParse(node.InnerText, out date);

                            review.publishDate = date;
                        }

                        //USER
                        review.userId       = User.id;
                        review.userIdString = User.userIdString;

                        //BOOK
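                        //note: the title cell is assumed to always contain a link; node is not null-checked before use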
                        node = reviewNode.SelectSingleNode(".//td[@class='field title']//a");
                        string bookUrl = node.GetAttributeValue("href", "");

                        int bookId = CrawlUtil.extractIdNumberFromUrl(bookUrl); //if bookUrl is null then bookId gets set to 0

                        Book book = CrawlUtil.getBook(context, bookId);

                        if (book == null)
                        {
                            book    = new Book();
                            book.id = bookId;

                            string title = node.GetAttributeValue("title", "");
                            book.title = title;

                            node = reviewNode.SelectSingleNode(".//td[@class='field isbn']//div");
                            if (node != null)
                            {
                                book.isbn = node.InnerText.Trim();
                            }

                            //AUTHOR
                            node = reviewNode.SelectSingleNode(".//td[@class='field author']//a");
                            if (node != null)
                            {
                                string authorUrl = node.GetAttributeValue("href", "");

                                int authorId = CrawlUtil.extractIdNumberFromUrl(authorUrl); //if authorUrl is null then authorId gets set to 0

                                Author author = CrawlUtil.getAuthor(context, authorId);

                                if (author == null)
                                {
                                    author    = new Author();
                                    author.id = authorId;

                                    author.name = CrawlUtil.formatAuthorName(node.InnerText.Trim());
                                }

                                //link the book to its author whether the author is new or already known
                                book.Author = author;
                            }
                        }

                        review.Book = book;

                        context.SaveChanges();
                    }
                }

                CrawlReviewsOnUserProfile.form.appendLineToLog(User.userIdString + ":: " + CrawlReviewsOnUserProfile.count + " reviews crawled");
            }
        }
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
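            //fill in the details of a single review page (reviewer, text, rating, date, 'recommended for') for a Review row that was created by an earlier crawl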
            HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;

            if (htmlDoc == null)
            {
                return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
            }

            //only process the review page
            if (propertyBag.ResponseUri.OriginalString != CrawlReviews.baseUri)
            {
                return;
            }


            lock (this)
            {
                GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();

                Review review = CrawlUtil.getReview(context, ReviewId);

                if (review == null)
                {
                    return; //this should never happen
                }

                //userId	int	Checked
                //userIdString	varchar(256)	Checked
                //reviewContent	varchar(MAX)	Checked
                //starRating	smallint	Checked
                //publishDate	date	Checked
                //recommendedFor	varchar(MAX)	Checked
                //numComments	int	Checked

                HtmlNode doc = htmlDoc.DocumentNode;

                int      userId        = -1;
                string   userIdString  = null;
                string   userName      = null;
                string   reviewContent = null;
                short    starRating    = -1;
                DateTime publishDate;
                //string recommendedFor = null;
                int numComments = -1;

                var reviewAuthorNode = doc.SelectSingleNode("//a[@itemprop='author']");
                if (reviewAuthorNode != null)
                {
                    userIdString = reviewAuthorNode.GetAttributeValue("href", null);

                    userIdString = CrawlUtil.extractUserIdStringFromUrl(userIdString); //null if error
                    userId       = CrawlUtil.extractUserIdFromString(userIdString);    //-1 or 0 if error
                    userName     = reviewAuthorNode.InnerText.Trim();                  //empty if error

                    if (userIdString != null && userId > 0 && !String.IsNullOrEmpty(userName))
                    {
                        var user = CrawlUtil.createOrGetUser(context, userId, userIdString, userName);
                        review.User         = user;
                        review.userIdString = userIdString;
                    }

                    //<a href="/user/show/52663-mer" class="userReview" itemprop="author">Mer</a>
                }

                var reviewContentNode = doc.SelectSingleNode("//div[@itemprop='reviewBody']");
                if (reviewContentNode != null)
                {
                    review.reviewContent = reviewContentNode.InnerText.Trim();
                }

                var starRatingNode = doc.SelectSingleNode("//div[@class='rating']/span[@class='value-title']");
                if (starRatingNode != null)
                {
                    short.TryParse(starRatingNode.GetAttributeValue("title", ""), out starRating);

                    review.starRating = starRating;
                    //<span class="value-title" title="5"></span>
                }

                var publishDateNode = doc.SelectSingleNode("//div[@class='right dtreviewed greyText smallText']/span[@class='value-title']");
                if (publishDateNode != null)
                {
                    DateTime.TryParse(publishDateNode.GetAttributeValue("title", ""), out publishDate);

                    review.publishDate = publishDate;
                    //<span class="value-title" title="2007-04-28"></span>
                }

                var recommendedForNode = doc.SelectSingleNode("//span[text()='Recommended for:']");
                if (recommendedForNode != null)
                {
                    review.recommendedFor = recommendedForNode.ParentNode.LastChild.InnerText.Trim();

                    /*
                     * <div>
                     *  <span class="">Recommended for:</span>
                     *     enviornmentalists, nurturers, parents and children who want to discuss empathy and reciprocity
                     * </div>
                     */
                }

                //var numCommentsNode = doc.SelectSingleNode("");
                //if (numCommentsNode != null)
                //{
                //    /*

                //     *  <h2 class="brownBackground">
                //     *  <div class="extraHeader">
                //     *      <a href="#comment_form" rel="nofollow">Post a comment &raquo;</a>
                //     *  </div>Comments
                //     *  <span class="smallText">
                //            (showing
                //            1-5
                //            of
                //            5)
                //         </span>
                //     * ...
                //     * </h2>
                //     */
                //}

                context.SaveChanges();
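                //note: the log line below assumes review.User was set above; it stays null if the author node was missing or invalid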
                CrawlReviews.form.appendLineToLog("Added review " + review.id + " by user " + review.User.name);
            }
        }
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
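            //updated profile crawler: optionally queues only the pages added since the last crawl, and skips reviews that already exist in the database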
            HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;

            if (htmlDoc == null)
            {
                return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
            }

            HtmlNode doc = htmlDoc.DocumentNode;

            //on page 1, add other pages for user
            if (propertyBag.ResponseUri.OriginalString == CrawlReviewsOnUserProfile_Updated.baseUri)
            {
                var node = doc.SelectSingleNode(".//div[@id='reviewPagination']");

                if (node != null)
                {
                    HtmlNode maxPageNode;
                    int      maxPage = 0;

                    try
                    {
                        maxPageNode = node.LastChild.PreviousSibling.PreviousSibling;
                        maxPage     = Int32.Parse(maxPageNode.InnerText.Trim());
                    }
                    catch (Exception)
                    {
                        CrawlReviewsOnUserProfile_Updated.form.appendLineToLog("Error getting maxPage on " + propertyBag.ResponseUri.OriginalString);
                        return;
                    }

                    //get new reviews since last crawl?
                    int pagesToCrawl = 0;
                    if (CrawlReviewsOnUserProfile_Updated.getReviewsSinceLastCrawl)
                    {
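                        //estimate how many pages are new, assuming 20 reviews per profile page (the page size the division below relies on)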
                        pagesToCrawl = maxPage - (User.Reviews.Count / 20); //int division results in truncation
                        if (pagesToCrawl < 1)
                        {
                            return;
                        }

                        /**** TEMP to get pages 30 and above (for users with more than 600 reviews (after getting up to 600 only on a previous run))****/
                        //for (int i = 30; i <= maxPage; i++)
                        //{
                        //    String s= "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + i + "&print=true&shelf=read&sort=date_added&view=reviews";
                        //    crawler.AddStep(new Uri(s), 0);

                        //    CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(s);
                        //}
                        //return;

                        /*** Old logic pre 2015 11 30 ***
                         * int startPage = (User.Reviews.Count / 20)+1;
                         * string uri;
                         * for (int i = startPage; i <= maxPage; i++)
                         * {
                         *  uri = "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + i + "&print=true&shelf=read&sort=date_added&view=reviews";
                         *  crawler.AddStep(new Uri(uri), 0);
                         *
                         *  CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(uri);
                         * }
                         * return;
                         * *************/
                    }
                    else //crawl every page
                    {
                        pagesToCrawl = maxPage;
                    }

                    string uri;
                    for (int i = 2; i <= pagesToCrawl; i++)
                    {
                        //http://www.goodreads.com/review/list/1-otis-chandler?page=3&print=true&shelf=read
                        uri = "http://www.goodreads.com/review/list/" + User.userIdString + "?page=" + i + "&print=true&shelf=read";
                        crawler.AddStep(new Uri(uri), 0);

                        CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(uri);
                    }
                    //fall through and crawl page 1 itself (we only return early above when no new pages need to be crawled)


                    //2015-11-30: getting the X latest pages is now redundant since reviews are now always sorted by date added ascending.
                    //feature removed for 2015 update 5 crawl

                    //get reviews from specified pages; or latest X pages
                    //get user's latest X pages of reviews if user has more than (maxPage * 20) reviews; X determined by getHowManyLatestPages
                    //if (maxPage > CrawlReviewsOnUserProfile_Updated.maxPage)
                    //{
                    //    if (CrawlReviewsOnUserProfile_Updated.getHowManyLatestPages > 0)
                    //    {
                    //        int numLatestPages = Math.Min(CrawlReviewsOnUserProfile_Updated.getHowManyLatestPages, maxPage - CrawlReviewsOnUserProfile_Updated.maxPage);
                    //        for (int i = 0; i < numLatestPages; i++)
                    //        {
                    //            string uriLatest = "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + (maxPage - i) + "&print=true&shelf=read&sort=date_added&view=reviews";
                    //            crawler.AddStep(new Uri(uriLatest), 0);

                    //            CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(uriLatest);
                    //        }
                    //    }

                    //    maxPage = CrawlReviewsOnUserProfile_Updated.maxPage;
                    //}

                    //string u;
                    //for (int i = CrawlReviewsOnUserProfile_Updated.minPage; i <= maxPage; i++)
                    //{
                    //    u = "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + i + "&print=true&shelf=read&sort=date_added&view=reviews";
                    //    crawler.AddStep(new Uri(u), 0);

                    //    CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(u);
                    //}

                    //if don't want to include page 1 then don't crawl after adding other pages to crawl
                    //if (CrawlReviewsOnUserProfile_Updated.minPage > 1)
                    //{
                    //    return;
                    //}
                }
            }

            lock (this)
            {
                GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();

                foreach (var reviewNode in doc.SelectNodes(".//tr[@class='bookalike review']"))
                {
                    string reviewIdString = reviewNode.GetAttributeValue("id", "");

                    if (reviewIdString == "")
                    {
                        return;
                    }

                    int reviewId = Int32.Parse(reviewIdString.Split('_')[1]);

                    //Review review = CrawlUtil.createOrGetReview(context, reviewId);
                    Review review = CrawlUtil.getReview(context, reviewId);

                    if (review == null) //review is new
                    {
                        review    = new Review();
                        review.id = reviewId;

                        context.Reviews.AddObject(review);
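                        //AddObject attaches the new review to the context so SaveChanges will insert it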
                    }
                    else //review already exists
                    {
                        continue;
                    }

                    HtmlNode node;

                    //REVIEW.rating

                    /*<td class="field rating">
                     *  <label>Reb's rating</label>
                     *  <div class="value">
                     *      <a class=" staticStars stars_4" title="really liked it">4 of 5 stars</a>
                     *  </div>
                     * </td>*/
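                    //the star rating is encoded in the anchor's class name (e.g. 'staticStars stars_4' => 4)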
                    node = reviewNode.SelectSingleNode(".//td[@class='field rating']//a");
                    if (node != null)
                    {
                        string ratingClassString = node.GetAttributeValue("class", "0");
                        short  rating            = CrawlUtil.getRatingFromClassString(ratingClassString);

                        review.starRating = rating;
                    }

                    //REVIEW.publishdate
                    node = reviewNode.SelectSingleNode(".//td[@class='field date_added']//span");
                    if (node != null)
                    {
                        DateTime date;
                        DateTime.TryParse(node.InnerText, out date);

                        review.publishDate = date;
                    }

                    //USER
                    review.userId       = User.id;
                    review.userIdString = User.userIdString;

                    //BOOK
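                    //as in the first crawler, the title link is assumed to always be present (no null check)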
                    node = reviewNode.SelectSingleNode(".//td[@class='field title']//a");
                    string bookUrl = node.GetAttributeValue("href", "");

                    int bookId = CrawlUtil.extractIdNumberFromUrl(bookUrl); //if bookUrl is null then bookId gets set to 0

                    Book book = CrawlUtil.getBook(context, bookId);

                    if (book == null)
                    {
                        book    = new Book();
                        book.id = bookId;

                        string title = node.GetAttributeValue("title", "");
                        book.title = title;

                        node = reviewNode.SelectSingleNode(".//td[@class='field isbn']//div");
                        if (node != null)
                        {
                            book.isbn = node.InnerText.Trim();
                        }

                        //AUTHOR
                        node = reviewNode.SelectSingleNode(".//td[@class='field author']//a");
                        if (node != null)
                        {
                            string authorUrl = node.GetAttributeValue("href", "");

                            int authorId = CrawlUtil.extractIdNumberFromUrl(authorUrl); //if authorUrl is null then authorId gets set to 0

                            Author author = CrawlUtil.getAuthor(context, authorId);

                            if (author == null)
                            {
                                author    = new Author();
                                author.id = authorId;

                                author.name = CrawlUtil.formatAuthorName(node.InnerText.Trim());
                            }

                            //link the book to its author whether the author is new or already known
                            book.Author = author;
                        }
                    }

                    review.Book = book;

                    context.SaveChanges();


                    CrawlReviewsOnUserProfile_Updated.count++;
                }

                CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(User.userIdString + ":: " + CrawlReviewsOnUserProfile_Updated.count + " reviews crawled");
            }
        }