public static void Run(frmMain parentForm, Book book)
{
    form = parentForm;
    IsolatedStorageModule.Setup(false);
    currentBook = book;

    existingReviewIds = CrawlUtil.getNewContext().Reviews
        .Where(r => r.bookId == currentBook.id)
        .Select(r => r.id)
        .ToList();

    baseUri = "http://www.goodreads.com/api/reviews_widget_iframe?did=DEVELOPER_ID&format=html&isbn="
        + book.isbn + "&links=660&min_rating=&review_back=fff&stars=000&text=000";

    c = new Crawler(new Uri(baseUri),
        new HtmlDocumentProcessor(),   // process HTML
        new ReviewIFrameDumperStep()); // custom step to process the review iframe pages

    // 2012-09-03: changed from 2 to 1 in the hope that it fixes the unknown, (seemingly) random crashes.
    c.MaximumThreadCount = 1;
    c.MaximumCrawlDepth = 1;
    c.ExcludeFilter = CrawlUtil.ExtensionsToSkip;
    c.AdhereToRobotRules = false;

    // Begin crawl
    c.Crawl();
}
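// For context: a step like ReviewIFrameDumperStep plugs into NCrawler's pipeline by
// implementing its IPipelineStep interface, whose single Process(Crawler, PropertyBag)
// method is the signature seen in every step below. A minimal skeleton (a sketch; the
// namespaces assume NCrawler 3.x and may need adjusting):
using HtmlAgilityPack;
using NCrawler;
using NCrawler.Interfaces;

public class ReviewIFrameDumperStep : IPipelineStep
{
    public void Process(Crawler crawler, PropertyBag propertyBag)
    {
        // "HtmlDoc" is populated by the upstream HtmlDocumentProcessor step.
        HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
        if (htmlDoc == null)
        {
            return; // HTTP error etc.
        }
        // ...parse and persist, as in the Process implementations below...
    }
}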
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; // happens on HTTP errors etc.; we don't bother retrying or anything like that :(
    }

    HtmlNode doc = htmlDoc.DocumentNode;

    lock (this)
    {
        // There are 30 authors per page, so the easier method is to read the total
        // count from the page title instead of paging through.
        if (propertyBag.ResponseUri.OriginalString == CrawlFavouriteAuthors.baseUri)
        {
            var node = doc.SelectSingleNode("//title");
            if (node != null)
            {
                string s = node.InnerText.Trim();
                User.numFavouriteAuthors = Convert.ToInt32(
                    CrawlUtil.extractNumberFromString(s.Substring(s.IndexOf(" authors"))));
            }
            else
            {
                User.numFavouriteAuthors = 0;
            }
        }

        CrawlFavouriteAuthors.form.appendLineToLog(
            User.userIdString + ":: updated to have " + User.numFavouriteAuthors + " favourite authors.");
    }
}
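// CrawlUtil.extractNumberFromString is not shown in this listing. Judging from its call
// sites (it returns a decimal, -1 signals "no number found", and it must cope with
// thousands separators such as "1,505 friends"), a plausible sketch:
using System.Globalization;
using System.Text.RegularExpressions;

public static decimal extractNumberFromString(string s)
{
    if (string.IsNullOrEmpty(s))
    {
        return -1; // callers test for -1 to detect a parse failure
    }
    Match m = Regex.Match(s, @"\d[\d,]*(\.\d+)?"); // first number, allowing commas and a decimal point
    if (!m.Success)
    {
        return -1;
    }
    return decimal.Parse(m.Value,
        NumberStyles.AllowThousands | NumberStyles.AllowDecimalPoint,
        CultureInfo.InvariantCulture);
}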
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; // happens on HTTP errors etc.; we don't bother retrying or anything like that :(
    }

    int maxPage = CrawlUtil.getMaxReviewIFramePageNumber(htmlDoc);

    // If we are at the base (page 1) uri, queue the remaining review pages.
    if (propertyBag.ResponseUri.ToString() == CrawlReviewIFrame.baseUri && maxPage != -1)
    {
        int maxPageToCrawl = maxPage;
        string uri = "";

        //if (maxPage > 10)   // commenting this out means all review pages are crawled
        //{
        //    maxPageToCrawl = 10;
        //}

        for (int i = 2; i <= maxPageToCrawl; i++)
        {
            uri = "http://www.goodreads.com/api/reviews_widget_iframe?did=DEVELOPER_ID&format=html&isbn="
                + CrawlReviewIFrame.currentBook.isbn
                + "&links=660&min_rating=&page=" + i + "&review_back=fff&stars=000&text=000";
            crawler.AddStep(new Uri(uri), 0);
        }

        CrawlReviewIFrame.form.appendLineToLog("Crawling " + maxPageToCrawl + " pages of reviews for "
            + CrawlReviewIFrame.currentBook.getShortTitle());
    }

    // Only process review iframe pages.
    if (!propertyBag.ResponseUri.OriginalString.StartsWith(CrawlReviewIFrame.baseUri.Substring(0, 100)))
    {
        return;
    }

    lock (this)
    {
        string currentPage = "0";
        var currentPageNode = htmlDoc.DocumentNode.SelectSingleNode("//*[@class='current']");
        if (currentPageNode != null)
        {
            currentPage = currentPageNode.InnerText.Trim();
        }

        var reviews = htmlDoc.DocumentNode.SelectNodes("//*[@itemtype='http://schema.org/Review']");
        if (reviews == null || reviews.Count == 0)
        {
            return;
        }

        //** TODO: handle dupes properly. -_-
        // The current method saves each review one by one and ignores any error on save,
        // which also means every review is re-attempted on every run :(
        GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();

        foreach (var r in reviews)
        {
            string reviewUrl;
            int reviewId = -1;
            Match match;
            var reviewLinkNode = r.SelectSingleNode(".//div[@class='gr_review_text']/link[@itemprop='url']");
            DateTime publishDate = DateTime.MinValue;
            short starRating = 0;
            Review toAdd = new Review();

            if (reviewLinkNode != null)
            {
                reviewUrl = reviewLinkNode.GetAttributeValue("href", "null");
                match = regReview.Match(reviewUrl);

                if (Int32.TryParse(match.Groups[1].Value, out reviewId))
                {
                    if (CrawlReviewIFrame.existingReviewIds.Contains(reviewId))
                    {
                        continue; // already in the database
                    }

                    var node = r.SelectSingleNode(".//span[@class='gr_review_date']");
                    if (node != null)
                    {
                        DateTime.TryParse(node.InnerText, out publishDate);
                    }

                    node = r.SelectSingleNode(".//span[@class='gr_rating']");
                    if (node != null)
                    {
                        starRating = CrawlUtil.countStarsFromString(node.InnerText);
                    }

                    toAdd.id = reviewId;
                    toAdd.bookId = CrawlReviewIFrame.currentBook.id;
                    toAdd.publishDate = publishDate;
                    toAdd.starRating = starRating;
                    toAdd.foundOnPage = Int32.Parse(currentPage);
                    toAdd.maxPage = maxPage;

                    context.Reviews.AddObject(toAdd);
                }

                try
                {
                    context.SaveChanges();
                }
                catch (Exception ex)
                {
                    context.Reviews.Detach(toAdd);
                    CrawlReviewIFrame.form.appendLineToLog(ex.Message);
                    if (ex.InnerException != null)
                    {
                        CrawlReviewIFrame.form.appendLineToLog("\t" + ex.InnerException.Message);
                    }
                }
            }
        }

        CrawlReviewIFrame.form.appendLineToLog("Added " + reviews.Count + " on page " + currentPage
            + " of " + maxPage + " for " + CrawlReviewIFrame.currentBook.getShortTitle());
    }
}
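// Neither getMaxReviewIFramePageNumber, countStarsFromString, nor the regReview field
// appears in this listing. Sketches consistent with how they are used above (assumptions,
// not the originals): the iframe paginates with links carrying a page= query parameter,
// the gr_rating span renders star characters, and review urls look like /review/show/123456789.
using System.Text.RegularExpressions;
using HtmlAgilityPack;

static readonly Regex regReview = new Regex(@"/review/show/(\d+)", RegexOptions.Compiled);

public static int getMaxReviewIFramePageNumber(HtmlDocument htmlDoc)
{
    int maxPage = -1; // callers treat -1 as "no pagination found"
    var links = htmlDoc.DocumentNode.SelectNodes("//a[contains(@href, 'page=')]");
    if (links == null)
    {
        return -1;
    }
    foreach (var a in links)
    {
        Match m = Regex.Match(a.GetAttributeValue("href", ""), @"[?&]page=(\d+)");
        int p;
        if (m.Success && int.TryParse(m.Groups[1].Value, out p) && p > maxPage)
        {
            maxPage = p;
        }
    }
    return maxPage;
}

public static short countStarsFromString(string s)
{
    // e.g. "★★★★☆" -> 4; assumes filled stars render as the '★' character
    short count = 0;
    foreach (char ch in s ?? "")
    {
        if (ch == '★')
        {
            count++;
        }
    }
    return count;
}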
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    // If we are at the base uri page, queue the remaining list pages.
    if (propertyBag.ResponseUri.ToString() == "http://www.goodreads.com/list/show/1.Best_Books_Ever?page=" + CrawlList.FromPage)
    {
        string uri = "";
        for (int i = CrawlList.FromPage + 1; i <= CrawlList.ToPage; i++)
        {
            uri = "http://www.goodreads.com/list/show/1.Best_Books_Ever?page=" + i;
            crawler.AddStep(new Uri(uri), 0);
            CrawlList.form.appendLineToLog("also crawling " + uri);
        }
    }

    // Only process list pages.
    if (!propertyBag.ResponseUri.OriginalString.StartsWith("http://www.goodreads.com/list/show/1.Best_Books_Ever"))
    {
        return;
    }

    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc != null)
    {
        lock (this)
        {
            var books = htmlDoc.DocumentNode.SelectNodes("//tr[@itemtype='http://schema.org/Book']");
            if (books == null || books.Count == 0)
            {
                return;
            }

            GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();

            foreach (var b in books)
            {
                string title = "null";
                string authorName = "null";
                var titleURLNode = b.SelectSingleNode(".//*[@class='bookTitle']");
                var authorURLNode = b.SelectSingleNode(".//*[@class='authorName']");
                string titleUrl = "null";
                string authorUrl = "null";
                Match match;
                string bookId = "-1";
                string authorId = "-1";
                Book newBook = null;
                Author author = null;

                if (titleURLNode != null && authorURLNode != null)
                {
                    titleUrl = titleURLNode.GetAttributeValue("href", "null");
                    match = regBook.Match(titleUrl);
                    bookId = match.Groups[1].Value;
                    title = titleURLNode.InnerText.Trim();

                    authorUrl = authorURLNode.GetAttributeValue("href", "null");
                    match = regAuthor.Match(authorUrl);
                    authorId = match.Groups[1].Value;
                    authorName = authorURLNode.InnerText.Trim();

                    author = CrawlUtil.createOrGetAuthor(context, Int32.Parse(authorId), authorName);
                    newBook = CrawlUtil.createOrGetBook(context, Int32.Parse(bookId), title);
                    newBook.Author = author;
                    //author.Book = newBook;
                }

                context.SaveChanges();
            }

            CrawlList.form.appendLineToLog("added/updated " + books.Count + " books and their authors");
        }
    }
}
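// regBook and regAuthor are declared elsewhere in the class, and createOrGetAuthor /
// createOrGetBook live in CrawlUtil. Plausible sketches, assuming Goodreads URLs of the
// form /book/show/2767052-the-hunger-games and /author/show/153394.Suzanne_Collins
// (group 1 capturing the numeric id) and the EF4 ObjectContext seen elsewhere in this code:
using System.Linq;
using System.Text.RegularExpressions;

static readonly Regex regBook = new Regex(@"/book/show/(\d+)", RegexOptions.Compiled);
static readonly Regex regAuthor = new Regex(@"/author/show/(\d+)", RegexOptions.Compiled);

public static Author createOrGetAuthor(GoodReadsCrawlerEntities context, int id, string name)
{
    Author a = context.Authors.SingleOrDefault(x => x.id == id);
    if (a == null)
    {
        a = new Author { id = id };
        context.Authors.AddObject(a); // EF4 ObjectContext API, as used in the steps above
    }
    a.name = name; // refresh the name whether new or existing
    return a;
}
// createOrGetBook, createOrGetUser, and createOrGetChallenge presumably follow the same shape.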
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; // happens on HTTP errors etc.; we don't bother retrying or anything like that :(
    }

    // Only process the review page itself.
    if (propertyBag.ResponseUri.OriginalString != CrawlReviews.baseUri)
    {
        return;
    }

    lock (this)
    {
        GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();
        Review review = CrawlUtil.getReview(context, ReviewId);
        if (review == null)
        {
            return; // this should never happen
        }

        // Columns to fill in:
        //   userId          int           Checked
        //   userIdString    varchar(256)  Checked
        //   reviewContent   varchar(MAX)  Checked
        //   starRating      smallint      Checked
        //   publishDate     date          Checked
        //   recommendedFor  varchar(MAX)  Checked
        //   numComments     int           Checked
        HtmlNode doc = htmlDoc.DocumentNode;
        int userId = -1;
        string userIdString = null;
        string userName = null;
        string reviewContent = null;
        short starRating = -1;
        DateTime publishDate;
        //string recommendedFor = null;
        int numComments = -1;

        // e.g. <a href="/user/show/52663-mer" class="userReview" itemprop="author">Mer</a>
        var reviewAuthorNode = doc.SelectSingleNode("//a[@itemprop='author']");
        if (reviewAuthorNode != null)
        {
            userIdString = reviewAuthorNode.GetAttributeValue("href", null);
            userIdString = CrawlUtil.extractUserIdStringFromUrl(userIdString); // null on error
            userId = CrawlUtil.extractUserIdFromString(userIdString);          // -1 or 0 on error
            userName = reviewAuthorNode.InnerText.Trim();                      // empty on error

            if (userIdString != null && userId > 0 && !String.IsNullOrEmpty(userName))
            {
                var user = CrawlUtil.createOrGetUser(context, userId, userIdString, userName);
                review.User = user;
                review.userIdString = userIdString;
            }
        }

        var reviewContentNode = doc.SelectSingleNode("//div[@itemprop='reviewBody']");
        if (reviewContentNode != null)
        {
            review.reviewContent = reviewContentNode.InnerText.Trim();
        }

        // e.g. <span class="value-title" title="5"></span>
        var starRatingNode = doc.SelectSingleNode("//div[@class='rating']/span[@class='value-title']");
        if (starRatingNode != null)
        {
            short.TryParse(starRatingNode.GetAttributeValue("title", ""), out starRating);
            review.starRating = starRating;
        }

        // e.g. <span class="value-title" title="2007-04-28"></span>
        var publishDateNode = doc.SelectSingleNode("//div[@class='right dtreviewed greyText smallText']/span[@class='value-title']");
        if (publishDateNode != null)
        {
            DateTime.TryParse(publishDateNode.GetAttributeValue("title", ""), out publishDate);
            review.publishDate = publishDate;
        }

        /* <div>
         *   <span class="">Recommended for:</span>
         *   environmentalists, nurturers, parents and children who want to discuss empathy and reciprocity
         * </div> */
        var recommendedForNode = doc.SelectSingleNode("//span[text()='Recommended for:']");
        if (recommendedForNode != null)
        {
            review.recommendedFor = recommendedForNode.ParentNode.LastChild.InnerText.Trim();
        }

        //var numCommentsNode = doc.SelectSingleNode("");
        //if (numCommentsNode != null)
        //{
        //    /* <h2 class="brownBackground">
        //     *   <div class="extraHeader"><a href="#comment_form" rel="nofollow">Post a comment »</a></div>Comments
        //     *   <span class="smallText">(showing 1-5 of 5)</span>
        //     *   ...
        //     * </h2> */
        //}

        context.SaveChanges();
        CrawlReviews.form.appendLineToLog("Added review " + review.id + " by user " + review.User.name);
    }
}
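// extractUserIdStringFromUrl and extractUserIdFromString are CrawlUtil helpers not shown
// here. Sketches matching the documented behaviour ("null if error" and "-1 or 0 if error")
// and hrefs like /user/show/52663-mer, where "52663-mer" is the userIdString and 52663 the
// numeric id (assumptions based on the call sites):
using System.Text.RegularExpressions;

public static string extractUserIdStringFromUrl(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return null;
    }
    Match m = Regex.Match(url, @"/(?:user|author)/show/([^/?#]+)");
    return m.Success ? m.Groups[1].Value : null;
}

public static int extractUserIdFromString(string userIdString)
{
    if (string.IsNullOrEmpty(userIdString))
    {
        return -1;
    }
    Match m = Regex.Match(userIdString, @"^(\d+)"); // leading digits are the numeric id
    int id;
    return (m.Success && int.TryParse(m.Groups[1].Value, out id)) ? id : 0;
}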
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; // happens on HTTP errors etc.; we don't bother retrying or anything like that :(
    }

    HtmlNode doc = htmlDoc.DocumentNode;

    // On page 1, queue the user's other pages. The pagination looks like:
    /* <div>
     *   <span class="previous_page disabled">« previous</span>
     *   <em class="current">1</em>
     *   <a rel="next" href="/list/user_votes/1045275-natasha?page=2">2</a>
     *   <a href="/list/user_votes/1045275-natasha?page=3">3</a>
     *   <a href="/list/user_votes/1045275-natasha?page=4">4</a>
     *   <a href="/list/user_votes/1045275-natasha?page=5">5</a>
     *   <a class="next_page" rel="next" href="/list/user_votes/1045275-natasha?page=2">next »</a>
     * </div> */
    if (propertyBag.ResponseUri.OriginalString == CrawlListAndVotes.baseUri)
    {
        var node = doc.SelectSingleNode(".//a[@class='next_page' and @rel='next']");
        if (node != null)
        {
            try
            {
                // The last numbered link (two siblings back from "next »") is the max page.
                var x = node.PreviousSibling.PreviousSibling;
                int maxPage = Int32.Parse(x.InnerText.Trim());
                string uri;
                for (int i = 2; i <= maxPage; i++)
                {
                    uri = "http://www.goodreads.com/list/user_votes/" + User.userIdString + "?page=" + i;
                    crawler.AddStep(new Uri(uri), 0);
                    CrawlListAndVotes.form.appendLineToLog(uri);
                }
            }
            catch (Exception ex)
            {
                CrawlListAndVotes.form.appendLineToLog(ex.Message);
            }
        }
    }

    lock (this)
    {
        GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();

        foreach (var listNode in doc.SelectNodes(".//div[@class='cell']"))
        {
            List l = null;
            string title = null;

            var titleNode = listNode.SelectSingleNode(".//a[@class='listTitle']");
            if (titleNode != null)
            {
                title = titleNode.InnerText.Trim();
            }

            if (title != null)
            {
                l = CrawlUtil.createOrGetList(context, title);
            }
            else
            {
                continue;
            }

            // e.g. "296 books — 994 voters"
            var statsNode = listNode.SelectSingleNode(".//div[@class='listFullDetails']");
            if (statsNode != null)
            {
                string s = statsNode.InnerText.Replace("\n", "").Trim();
                l.numBooks = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                s = s.Substring(s.IndexOf("books"));
                l.numVoters = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
            }

            User u = CrawlUtil.getUser(context, User.id);
            u.Lists.Add(l);

            try
            {
                context.SaveChanges();
                CrawlListAndVotes.count++;
            }
            catch (Exception ex)
            {
                // The inner exception is usually a duplicate-PK error; undo the association and move on.
                User.Lists.Remove(l);
                //CrawlListAndVotes.form.appendLineToLog(ex.Message);
            }
        }

        CrawlListAndVotes.form.appendLineToLog(User.userIdString + ":: " + CrawlListAndVotes.count + " lists added");
    }
}
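// createOrGetList presumably mirrors the create-or-get pattern sketched earlier, keyed on
// the list title. A sketch under that assumption (the entity set and property names are
// guesses based on the usage above):
using System.Linq;

public static List createOrGetList(GoodReadsCrawlerEntities context, string title)
{
    List l = context.Lists.SingleOrDefault(x => x.title == title);
    if (l == null)
    {
        l = new List { title = title };
        context.Lists.AddObject(l);
    }
    return l;
}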
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; // happens on HTTP errors etc.; we don't bother retrying or anything like that :(
    }

    HtmlNode doc = htmlDoc.DocumentNode;

    lock (this)
    {
        GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();
        HtmlNodeCollection nodes;
        HtmlNode node;

        // Update the user's status changes: private (default 0), author (default 0), or username.
        var url = propertyBag.ResponseUri.AbsoluteUri;
        if (url != CrawlUserProfile.baseUri) // we were redirected: user is an author or has changed useridstring
        {
            if (url == "http://www.goodreads.com/") // the user's profile no longer exists
            {
                CrawlUserProfile.form.appendLineToLog(string.Format("User {0} no longer exists.", User.userIdString));
                User.IsPrivate = true;
            }

            if (url.Contains("/author/"))
            {
                User.IsAuthor = true;
                User.authorUrl = url;
            }

            if (url.Contains("/show/")) // both users and authors can change usernames
            {
                if (User.IsAuthor != true)
                {
                    // When a user changes their username, it can be extracted from the new url.
                    User.userIdString = CrawlUtil.extractUserIdStringFromUrl(url);
                }
                else
                {
                    // An author's username change cannot be detected from the authorUrl we are
                    // redirected to, so search for the new username via a link containing the
                    // user's numeric id, e.g. links to /user/show/37868189.
                    node = doc.SelectSingleNode(".//a[starts-with(@href, '/user/show/" + User.id + "')]");
                    if (node != null)
                    {
                        var IdStringToExtract = node.GetAttributeValue("href", "");
                        User.userIdString = CrawlUtil.extractUserIdStringFromUrl(IdStringToExtract);
                    }
                }
            }

            CrawlUserProfile.form.appendLineToLog(string.Format(
                "User {0} updated with status IsAuthor = {1}| {2}; IsPrivate = {3}",
                User.userIdString, User.IsAuthor, User.authorUrl, User.IsPrivate));
        }

        // Name. Inner text for a user: "******"; for an author: "Derek White".
        node = doc.SelectSingleNode("//h1//text()[normalize-space()]");
        if (node != null)
        {
            User.name = CrawlUtil.extractNameFromString(node.InnerText);
        }

        /* The stats live in markup like:
         * <div class="leftContainer">
         *   <div class="leftAlignedImage" style="overflow: hidden; width: 110px;">
         *     <a href="/photo/user/3255548-ah" rel="nofollow" title="AH">
         *       <img alt="AH" src="http://d.gr-assets.com/users/1275413869p3/3255548.jpg" />
         *     </a>
         *     <div class="smallText">
         *       <a href="/review/list/3255548?sort=rating&view=reviews">874 ratings</a>
         *       <a href="#" onclick="...">(3.82 avg)</a>
         *       <div class="floatingBox" style="display:none; width: 400px;" id="ratingDistribution3255548"></div>
         *       <a href="/review/list/3255548?sort=review&view=reviews">1218 reviews</a>
         *       <a href="/photo/user/3255548-ah" rel="nofollow">more photos (3)</a>
         *       #<a href="/user/best_reviewers?country=CA&duration=a">10 best reviewers</a>
         *       #<a href="/user/top_reviewers?country=CA&duration=a">36 top reviewers</a>
         *     </div>
         *   </div>
         * </div>
         * which renders roughly as:
         *   196 ratings (3.29 avg) 115 reviews #10 best reviewers #36 top reviewers
         */

        // numRatings, numReviews, avgRating, badges.
        // EXCEPT badges, this code works for both user and author pages.
        nodes = doc.SelectNodes(".//div[@class='leftContainer']//div[@class='smallText']//text()[normalize-space()]");
        if (nodes == null)
        {
            // Private profiles still carry this data but in a different HTML layout;
            // if that different layout is found, we deem the profile private.
            User.IsPrivate = true;
            CrawlUserProfile.form.appendLineToLog(string.Format("User {0} profile is deemed private.", User.userIdString));
            return;
        }

        int badgeNum = 1;
        foreach (var n in nodes) // numerical values and badges
        {
            string inner = n.InnerText.Trim();

            // Text beginning with "//" is script text: skip it.
            if (inner.StartsWith(@"//"))
            {
                continue;
            }

            if (inner.StartsWith("#")) // text beginning with "#" is a badge
            {
                switch (badgeNum)
                {
                    case 1: User.badge1 = inner; break;
                    case 2: User.badge2 = inner; break;
                    case 3: User.badge3 = inner; break;
                    case 4: User.badge4 = inner; break;
                    case 5: User.badge5 = inner; break;
                }
                badgeNum++;
            }
            else
            {
                decimal d = CrawlUtil.extractNumberFromString(inner);
                if (d != -1)
                {
                    // Author pages use "avg rating", which matches both "rating" and "avg",
                    // so match "avg" first.
                    if (inner.Contains("avg"))
                    {
                        User.avgRating = d;
                    }
                    else if (inner.Contains("rating"))
                    {
                        User.numRatings = Convert.ToInt32(d);
                    }
                    else if (inner.Contains("review"))
                    {
                        User.numReviews = Convert.ToInt32(d);
                    }
                }
            }
        }

        // Badges for authors live in the topListSection instead.
        if (User.IsAuthor == true)
        {
            nodes = doc.SelectNodes(".//div[@class='leftContainer']//div[@id='topListSection']//text()[normalize-space()]");
            if (nodes != null)
            {
                badgeNum = 1;
                foreach (var n in nodes)
                {
                    string inner = n.InnerText.Trim();
                    if (inner.StartsWith("#"))
                    {
                        switch (badgeNum)
                        {
                            case 1: User.badge1 = inner; break;
                            case 2: User.badge2 = inner; break;
                            case 3: User.badge3 = inner; break;
                            case 4: User.badge4 = inner; break;
                            case 5: User.badge5 = inner; break;
                        }
                        badgeNum++;
                    }
                }
            }
        }

        // numFriends. Works for both users and authors, e.g.:
        //   <a href="/friend/user/104320-erin-beck" rel="nofollow">Erin’s Friends (4)</a>
        //   <a rel="nofollow" href="/friend/user/3094317-tori-smexybooks-smexys-sidekick">(Tori-Smexybooks)’s Friends (1,505)</a>
        node = doc.SelectSingleNode(".//a[@href='/friend/user/" + User.userIdString + "']/text()");
        if (node != null)
        {
            decimal d = CrawlUtil.extractNumberFromString(node.InnerText);
            if (d != -1)
            {
                User.numFriends = Convert.ToInt32(d);
            }
        }

        /* Each friend's html:
         * <div class="left">
         *   <div class="friendName">
         *     <a href="/user/show/355607-t-k-kenyon" rel="acquaintance">T.K. Kenyon</a>
         *   </div>
         *   819 books
         *   <span class="greyText">|</span>
         *   2,392 friends
         * </div> */
        // Summary details of the 8 friends shown on the profile.
        if (CrawlUserProfile.addFriends)
        {
            nodes = doc.SelectNodes(".//div[@class='left']");
            if (nodes != null)
            {
                foreach (var n in nodes) // each friend
                {
                    Friendship f = new Friendship();
                    f.userId = this.User.id;

                    node = n.SelectSingleNode(".//div[@class='friendName']/a");
                    if (node != null)
                    {
                        string attr = node.GetAttributeValue("href", "");
                        f.friendIdString = CrawlUtil.extractUserIdStringFromUrl(attr);
                        f.rel = node.GetAttributeValue("rel", "");
                    }
                    else
                    {
                        continue;
                    }

                    node = n.SelectSingleNode("./text()[normalize-space()]"); // number of books
                    if (node != null)
                    {
                        int numBooks = Convert.ToInt32(CrawlUtil.extractNumberFromString(node.InnerText));
                        if (numBooks != -1)
                        {
                            f.friendNumBooks = numBooks;
                        }

                        node = node.SelectSingleNode("following-sibling::text()"); // number of friends
                        if (node != null)
                        {
                            int numFriends = Convert.ToInt32(CrawlUtil.extractNumberFromString(node.InnerText));
                            if (numFriends != -1)
                            {
                                f.friendNumFriends = numFriends;
                            }
                        }
                    }

                    try
                    {
                        context.Friendships.AddObject(f);
                        context.SaveChanges();
                    }
                    catch (Exception)
                    {
                        context.Friendships.Detach(f); // most likely a duplicate
                    }
                }
            }
        }

        // User.numFollowers, e.g.:
        //   users:   <a class="actionLink" rel="nofollow" href="/user/3094317-tori-smexybooks-smexys-sidekick/followers">319 people are following (Tori-Smexybooks)</a>
        //   authors: <a href="/author_followings?id=6458332&method=get">Lucas Lyndes’s Followers (8)</a>
        if (User.IsAuthor == true)
        {
            node = doc.SelectSingleNode(".//a[starts-with(@href, '/author_followings?')]/text()");
        }
        else
        {
            node = doc.SelectSingleNode(".//a[@href='/user/" + User.userIdString + "/followers']/text()");
        }
        if (node != null)
        {
            decimal d = CrawlUtil.extractNumberFromString(node.InnerText);
            if (d != -1)
            {
                User.numFollowers = Convert.ToInt32(d);
            }
        }

        // User.numUserIsFollowing - see [done]goodreads-numUserIsFollowing logic. N/A for authors.
        if (User.IsAuthor == true)
        {
            User.numUserIsFollowing = null;
        }
        else
        {
            nodes = doc.SelectNodes(".//a[contains(text(),'is Following')]/../../../div[@class='bigBoxBody']//div/a");
            if (nodes != null)
            {
                User.numUserIsFollowing = nodes.Count;
            }
        }

        // Quiz stats - see [done]goodreads-quiz logic.html. N/A for authors.
        // User.quizNumCorrect, User.quizNumQuestions, User.quizRank, User.quizRankOutOf
        if (User.IsAuthor == true)
        {
            User.quizNumCorrect = null;
            User.quizNumQuestions = null;
            User.quizRank = null;
            User.quizRankOutOf = null;
        }
        else
        {
            nodes = doc.SelectNodes(".//div[@id='myQuizStats']//div[@class='infoBoxRowTitle' or @class='infoBoxRowItem']");
            if (nodes != null)
            {
                string s = "";
                foreach (var n in nodes)
                {
                    if (n.InnerText.Contains("questions answered"))
                    {
                        s = n.NextSibling.NextSibling.InnerText;
                        User.quizNumQuestions = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                    }
                    else if (n.InnerText.Contains("correct"))
                    {
                        s = n.NextSibling.NextSibling.InnerText;
                        User.quizNumCorrect = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                    }
                    else if (n.InnerText.Contains("ranking"))
                    {
                        s = n.NextSibling.NextSibling.InnerText;
                        User.quizRank = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                        s = s.Substring(s.IndexOf("out of"));
                        User.quizRankOutOf = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                    }
                }
            }
        }

        // User.ReadingChallenges - see [done]goodreads-challenge logic.html.
        // CHECK if working for authors.
        if (CrawlUserProfile.addChallenges)
        {
            nodes = doc.SelectNodes(".//div[@class='challengePic']");
            if (nodes != null)
            {
                foreach (var n in nodes)
                {
                    var challengePic = n.SelectSingleNode(".//img");
                    if (challengePic != null)
                    {
                        string challenge = challengePic.GetAttributeValue("alt", "unknown");
                        if (challenge != "unknown")
                        {
                            ReadingChallenge rc = CrawlUtil.createOrGetChallenge(context, User.id, challenge);
                            try
                            {
                                var stats = n.NextSibling.NextSibling.SelectSingleNode(".//div[@class='bookMeta progressStats']");
                                if (stats != null)
                                {
                                    string s = stats.InnerText;
                                    rc.numBooksRead = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                                    s = s.Substring(s.IndexOf(" of "));
                                    rc.numBooksTarget = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                                }
                                else
                                {
                                    stats = n.NextSibling.NextSibling.SelectSingleNode(".//a[@class='challengeBooksRead']");
                                    if (stats != null)
                                    {
                                        string s = stats.InnerText;
                                        rc.numBooksRead = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                                        s = s.Substring(s.IndexOf(" of "));
                                        rc.numBooksTarget = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                                    }
                                }
                                rc.lastUpdated = DateTime.Now;
                                context.SaveChanges();
                            }
                            catch (Exception ex)
                            {
                                context.ReadingChallenges.Detach(rc);
                            }
                        }
                    }
                }
            }
        }

        // User.IsLibrarian, e.g.:
        //   <a href="/librarian/user/255233-sonic" rel="nofollow">Goodreads librarian</a>
        node = doc.SelectSingleNode(".//a[@href='/librarian/user/" + User.userIdString + "']");
        User.IsLibrarian = (node != null && node.InnerText == "Goodreads librarian");

        // User.Genres - see [done]goodreads-genres (fav) logic.html.
        // UNCHECKED for round 5 update.
        if (CrawlUserProfile.addGenres)
        {
            nodes = doc.SelectNodes(".//h2[contains(text(),'Favorite Genres')]/../../div[@class='bigBoxBody']//a");
            if (nodes != null)
            {
                foreach (var n in nodes)
                {
                    Genre g = new Genre();
                    g.name = n.InnerText.Trim();
                    g.userId = User.id;
                    try
                    {
                        context.AddToGenres(g);
                        context.SaveChanges();
                    }
                    catch (Exception)
                    {
                        context.Genres.Detach(g);
                    }
                }
            }
        }

        // User.Activities - see [done]goodreads-updates logic.html.
        // UNCHECKED for round 5 update.
        if (CrawlUserProfile.addActivities)
        {
            nodes = doc.SelectNodes(".//table[@class='tableListReverse friendUpdates']/tr[@class='update' or @class='no_border']");
            if (nodes != null)
            {
                foreach (var n in nodes)
                {
                    if ("update" == n.GetAttributeValue("class", "unknown"))
                    {
                        Activity a = new Activity();
                        try
                        {
                            a.userId = User.id;
                            a.activityHTML = n.InnerHtml.Length > 8000 ? n.InnerHtml.Substring(0, 7999) : n.InnerHtml;
                            var ts = n.SelectSingleNode("following-sibling::tr//a[@class='updatedTimestamp']");
                            a.activityTimestampString = ts.InnerText;
                            a.retrievedAt = DateTime.Now;
                            context.AddToActivities(a);
                            context.SaveChanges();
                        }
                        catch (Exception ex)
                        {
                            context.Activities.Detach(a);
                        }
                    }
                }
            }
        }

        // Handled by separate functions/classes:
        //   User.numFavouriteAuthors, User.Groups, User.Lists, User.numShelves
    }

    CrawlUserProfile.form.appendLineToLog(User.userIdString + ":: details and ticked updated.");
}
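// The review-list steps below look entities up by primary key via CrawlUtil.getReview,
// getBook, getAuthor, and getUser. These are presumably thin lookups; a sketch of one,
// assuming LINQ to Entities over the same ObjectContext (the others would be analogous):
using System.Linq;

public static Review getReview(GoodReadsCrawlerEntities context, int id)
{
    return context.Reviews.SingleOrDefault(r => r.id == id); // null when not yet crawled
}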
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; // happens on HTTP errors etc.; we don't bother retrying or anything like that :(
    }

    HtmlNode doc = htmlDoc.DocumentNode;

    // On page 1, queue the user's other review pages.
    if (propertyBag.ResponseUri.OriginalString == CrawlReviewsOnUserProfile.baseUri)
    {
        var node = doc.SelectSingleNode(".//div[@id='reviewPagination']");
        if (node != null)
        {
            try
            {
                // The last numbered link (two siblings back from "next »") is the max page.
                var x = node.LastChild.PreviousSibling.PreviousSibling;
                int maxPage = Int32.Parse(x.InnerText.Trim());
                string uri;
                for (int i = 2; i <= maxPage; i++)
                {
                    uri = "http://www.goodreads.com/review/list/" + User.userIdString
                        + "?format=html&page=" + i + "&per_page=100&shelf=read";
                    crawler.AddStep(new Uri(uri), 0);
                    CrawlReviewsOnUserProfile.form.appendLineToLog(uri);
                }
            }
            catch (Exception) { }
        }
    }

    lock (this)
    {
        GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();

        foreach (var reviewNode in doc.SelectNodes(".//tr[@class='bookalike review']"))
        {
            CrawlReviewsOnUserProfile.count++;

            string reviewIdString = reviewNode.GetAttributeValue("id", "");
            if (reviewIdString == "")
            {
                return;
            }
            int reviewId = Int32.Parse(reviewIdString.Split('_').GetValue(1).ToString());

            Review review = CrawlUtil.getReview(context, reviewId);

            // Create and process the REVIEW only if it doesn't already exist.
            if (review == null)
            {
                HtmlNode node;
                review = new Review();
                review.id = reviewId;

                // REVIEW.starRating
                node = reviewNode.SelectSingleNode(".//td[@class='field rating']//img");
                if (node != null)
                {
                    string ratingString = node.GetAttributeValue("alt", "0");
                    short rating = short.Parse(ratingString.Substring(0, 1));
                    review.starRating = rating;
                }

                // REVIEW.publishDate
                node = reviewNode.SelectSingleNode(".//td[@class='field date_added']//span");
                if (node != null)
                {
                    DateTime date;
                    DateTime.TryParse(node.InnerText, out date);
                    review.publishDate = date;
                }

                // USER
                review.userId = User.id;
                review.userIdString = User.userIdString;

                // BOOK
                node = reviewNode.SelectSingleNode(".//td[@class='field title']//a");
                string bookUrl = node.GetAttributeValue("href", "");
                int bookId = CrawlUtil.extractIdNumberFromUrl(bookUrl); // 0 if bookUrl is null
                Book book = CrawlUtil.getBook(context, bookId);
                if (book == null)
                {
                    book = new Book();
                    book.id = bookId;
                    book.title = node.GetAttributeValue("title", "");

                    node = reviewNode.SelectSingleNode(".//td[@class='field isbn']//div");
                    if (node != null)
                    {
                        book.isbn = node.InnerText.Trim();
                    }

                    // AUTHOR
                    node = reviewNode.SelectSingleNode(".//td[@class='field author']//a");
                    if (node != null)
                    {
                        string authorUrl = node.GetAttributeValue("href", "");
                        int authorId = CrawlUtil.extractIdNumberFromUrl(authorUrl); // 0 if authorUrl is null
                        Author author = CrawlUtil.getAuthor(context, authorId);
                        if (author == null)
                        {
                            author = new Author();
                            author.id = authorId;
                            author.name = CrawlUtil.formatAuthorName(node.InnerText.Trim());
                            book.Author = author;
                        }
                    }
                }

                review.Book = book;
                context.SaveChanges();
            }
        }

        CrawlReviewsOnUserProfile.form.appendLineToLog(User.userIdString + ":: " + CrawlReviewsOnUserProfile.count + " reviews crawled");
    }
}
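// extractIdNumberFromUrl pulls the numeric id out of /book/show/... and /author/show/...
// hrefs; per the comments above, it yields 0 when the url is null or unparsable. A sketch
// (an assumption; the real helper lives in CrawlUtil):
using System.Text.RegularExpressions;

public static int extractIdNumberFromUrl(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return 0;
    }
    Match m = Regex.Match(url, @"/show/(\d+)");
    int id;
    return (m.Success && int.TryParse(m.Groups[1].Value, out id)) ? id : 0;
}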
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; // happens on HTTP errors etc.; we don't bother retrying or anything like that :(
    }

    HtmlNode doc = htmlDoc.DocumentNode;

    // On page 1, queue the user's other review pages.
    if (propertyBag.ResponseUri.OriginalString == CrawlReviewsOnUserProfile_Updated.baseUri)
    {
        var node = doc.SelectSingleNode(".//div[@id='reviewPagination']");
        if (node != null)
        {
            HtmlNode maxPageNode;
            int maxPage = 0;
            try
            {
                maxPageNode = node.LastChild.PreviousSibling.PreviousSibling;
                maxPage = Int32.Parse(maxPageNode.InnerText.Trim());
            }
            catch (Exception)
            {
                CrawlReviewsOnUserProfile_Updated.form.appendLineToLog("Error getting maxPage on " + propertyBag.ResponseUri.OriginalString);
                return;
            }

            // Get only the new reviews since the last crawl?
            int pagesToCrawl = 0;
            if (CrawlReviewsOnUserProfile_Updated.getReviewsSinceLastCrawl)
            {
                pagesToCrawl = maxPage - (User.Reviews.Count / 20); // int division truncates
                if (pagesToCrawl < 1)
                {
                    return;
                }

                /**** TEMP: to get pages 30 and above (for users with more than 600 reviews,
                 * after a previous run that stopped at 600) ****/
                //for (int i = 30; i <= maxPage; i++)
                //{
                //    String s = "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + i + "&print=true&shelf=read&sort=date_added&view=reviews";
                //    crawler.AddStep(new Uri(s), 0);
                //    CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(s);
                //}
                //return;

                /*** Old logic, pre 2015-11-30 ***
                 * int startPage = (User.Reviews.Count / 20) + 1;
                 * string uri;
                 * for (int i = startPage; i <= maxPage; i++)
                 * {
                 *     uri = "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + i + "&print=true&shelf=read&sort=date_added&view=reviews";
                 *     crawler.AddStep(new Uri(uri), 0);
                 *     CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(uri);
                 * }
                 * return;
                 *************/
            }
            else // crawl every page
            {
                pagesToCrawl = maxPage;
            }

            string uri;
            for (int i = 2; i <= pagesToCrawl; i++)
            {
                // e.g. http://www.goodreads.com/review/list/1-otis-chandler?page=3&print=true&shelf=read
                uri = "http://www.goodreads.com/review/list/" + User.userIdString + "?page=" + i + "&print=true&shelf=read";
                crawler.AddStep(new Uri(uri), 0);
                CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(uri);
            }

            // Continue with the crawl on page 1 unless endPage is 0 (i.e. no pages need to be crawled).
            // 2015-11-30: getting the latest X pages is now redundant since reviews are always
            // sorted by date added, ascending. Feature removed for the 2015 update-5 crawl:
            //get reviews from specified pages, or the latest X pages;
            //get the user's latest X pages of reviews if the user has more than (maxPage * 20) reviews; X determined by getHowManyLatestPages
            //if (maxPage > CrawlReviewsOnUserProfile_Updated.maxPage)
            //{
            //    if (CrawlReviewsOnUserProfile_Updated.getHowManyLatestPages > 0)
            //    {
            //        int numLatestPages = Math.Min(CrawlReviewsOnUserProfile_Updated.getHowManyLatestPages, maxPage - CrawlReviewsOnUserProfile_Updated.maxPage);
            //        for (int i = 0; i < numLatestPages; i++)
            //        {
            //            string uriLatest = "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + (maxPage - i) + "&print=true&shelf=read&sort=date_added&view=reviews";
            //            crawler.AddStep(new Uri(uriLatest), 0);
            //            CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(uriLatest);
            //        }
            //    }
            //    maxPage = CrawlReviewsOnUserProfile_Updated.maxPage;
            //}
            //string u;
            //for (int i = CrawlReviewsOnUserProfile_Updated.minPage; i <= maxPage; i++)
            //{
            //    u = "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + i + "&print=true&shelf=read&sort=date_added&view=reviews";
            //    crawler.AddStep(new Uri(u), 0);
            //    CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(u);
            //}
            ////if page 1 shouldn't be included, stop here after adding the other pages to crawl
            //if (CrawlReviewsOnUserProfile_Updated.minPage > 1)
            //{
            //    return;
            //}
        }
    }

    lock (this)
    {
        GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();

        foreach (var reviewNode in doc.SelectNodes(".//tr[@class='bookalike review']"))
        {
            string reviewIdString = reviewNode.GetAttributeValue("id", "");
            if (reviewIdString == "")
            {
                return;
            }
            int reviewId = Int32.Parse(reviewIdString.Split('_').GetValue(1).ToString());

            //Review review = CrawlUtil.createOrGetReview(context, reviewId);
            Review review = CrawlUtil.getReview(context, reviewId);
            if (review == null) // review is new
            {
                review = new Review();
                review.id = reviewId;
                context.Reviews.AddObject(review);
            }
            else // review already exists
            {
                continue;
            }

            HtmlNode node;

            // REVIEW.starRating
            /* <td class="field rating">
             *   <label>Reb's rating</label>
             *   <div class="value">
             *     <a class=" staticStars stars_4" title="really liked it">4 of 5 stars</a>
             *   </div>
             * </td> */
            node = reviewNode.SelectSingleNode(".//td[@class='field rating']//a");
            if (node != null)
            {
                string ratingClassString = node.GetAttributeValue("class", "0");
                short rating = CrawlUtil.getRatingFromClassString(ratingClassString);
                review.starRating = rating;
            }

            // REVIEW.publishDate
            node = reviewNode.SelectSingleNode(".//td[@class='field date_added']//span");
            if (node != null)
            {
                DateTime date;
                DateTime.TryParse(node.InnerText, out date);
                review.publishDate = date;
            }

            // USER
            review.userId = User.id;
            review.userIdString = User.userIdString;

            // BOOK
            node = reviewNode.SelectSingleNode(".//td[@class='field title']//a");
            string bookUrl = node.GetAttributeValue("href", "");
            int bookId = CrawlUtil.extractIdNumberFromUrl(bookUrl); // 0 if bookUrl is null
            Book book = CrawlUtil.getBook(context, bookId);
            if (book == null)
            {
                book = new Book();
                book.id = bookId;
                book.title = node.GetAttributeValue("title", "");

                node = reviewNode.SelectSingleNode(".//td[@class='field isbn']//div");
                if (node != null)
                {
                    book.isbn = node.InnerText.Trim();
                }

                // AUTHOR
                node = reviewNode.SelectSingleNode(".//td[@class='field author']//a");
                if (node != null)
                {
                    string authorUrl = node.GetAttributeValue("href", "");
                    int authorId = CrawlUtil.extractIdNumberFromUrl(authorUrl); // 0 if authorUrl is null
                    Author author = CrawlUtil.getAuthor(context, authorId);
                    if (author == null)
                    {
                        author = new Author();
                        author.id = authorId;
                        author.name = CrawlUtil.formatAuthorName(node.InnerText.Trim());
                        book.Author = author;
                    }
                }
            }

            review.Book = book;
            context.SaveChanges();
            CrawlReviewsOnUserProfile_Updated.count++;
        }

        CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(User.userIdString + ":: " + CrawlReviewsOnUserProfile_Updated.count + " reviews crawled");
    }
}
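// getRatingFromClassString maps the rating anchor's class attribute (e.g. " staticStars
// stars_4", title "4 of 5 stars") to a star count. A sketch consistent with that markup
// (an assumption; the real helper is in CrawlUtil):
using System.Text.RegularExpressions;

public static short getRatingFromClassString(string classString)
{
    Match m = Regex.Match(classString ?? "", @"stars_(\d)");
    short rating;
    return (m.Success && short.TryParse(m.Groups[1].Value, out rating)) ? rating : (short)0;
}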