public static void Run(frmMain parentForm, Book book)
        {
            form = parentForm;
            IsolatedStorageModule.Setup(false);

            currentBook = book;

            existingReviewIds = CrawlUtil.getNewContext().Reviews.Where(r => r.bookId == currentBook.id).Select(r => r.id).ToList();

            baseUri = "http://www.goodreads.com/api/reviews_widget_iframe?did=DEVELOPER_ID&format=html&isbn=" + book.isbn + "&links=660&min_rating=&review_back=fff&stars=000&text=000";

            c = new Crawler(new Uri(baseUri),
                            new HtmlDocumentProcessor(), // process HTML
                            new ReviewIFrameDumperStep()); // custom step to dump the review iframes

            //** 2012-09-03: reduced from 2 to 1 in the hope of fixing the unknown, seemingly random crashes.
            c.MaximumThreadCount = 1;
            c.MaximumCrawlDepth  = 1;
            c.ExcludeFilter      = CrawlUtil.ExtensionsToSkip;

            c.AdhereToRobotRules = false;

            // Begin crawl
            c.Crawl();
        }
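        // Hedged sketch (not part of the original source): ReviewIFrameDumperStep is
        // referenced in Run() above but never shown. Under NCrawler a custom pipeline step
        // implements IPipelineStep with the same Process signature used throughout this
        // listing. A minimal step that dumps each fetched iframe page to a temp file might
        // look like the following; the file-naming scheme is an illustrative assumption.
        // (Requires: using System.IO; using NCrawler; using NCrawler.Interfaces;)
        public class ReviewIFrameDumperStep : IPipelineStep
        {
            public void Process(Crawler crawler, PropertyBag propertyBag)
            {
                //skip pages that didn't download cleanly
                if (propertyBag.StatusCode != System.Net.HttpStatusCode.OK)
                {
                    return;
                }

                //derive a stable file name from the page's URI and dump the raw HTML
                string fileName = "iframe_" + (uint)propertyBag.ResponseUri.PathAndQuery.GetHashCode() + ".html";
                File.WriteAllText(Path.Combine(Path.GetTempPath(), fileName), propertyBag.Text);
            }
        }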
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;

            if (htmlDoc == null)
            {
                return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
            }
            int maxPage = CrawlUtil.getMaxReviewIFramePageNumber(htmlDoc);

            //add other review pages if at base (uri) page
            if (propertyBag.ResponseUri.ToString() == CrawlReviewIFrame.baseUri && maxPage != -1)
            {
                int maxPageToCrawl = maxPage;

                string uri = "";
                //NOTE: the cap below is intentionally commented out, i.e. all review pages are crawled.
                //if (maxPage > 10)
                //{
                //    maxPageToCrawl = 10;
                //}

                for (int i = 2; i <= maxPageToCrawl; i++)
                {
                    uri = "http://www.goodreads.com/api/reviews_widget_iframe?did=DEVELOPER_ID&amp;format=html&amp;isbn=" + CrawlReviewIFrame.currentBook.isbn + "&amp;links=660&amp;min_rating=&amp;page=" + i + "&amp;review_back=fff&amp;stars=000&amp;text=000";
                    crawler.AddStep(new Uri(uri), 0);
                }


                CrawlReviewIFrame.form.appendLineToLog("Crawling " + maxPageToCrawl + " pages of reviews for " + CrawlReviewIFrame.currentBook.getShortTitle());
            }

            //only process review iframe pages (matched on the invariant prefix of baseUri)
            if (!propertyBag.ResponseUri.OriginalString.StartsWith(CrawlReviewIFrame.baseUri.Substring(0, 100)))
            {
                return;
            }

            lock (this)
            {
                string currentPage     = "0";
                var    currentPageNode = htmlDoc.DocumentNode.SelectSingleNode("//*[@class='current']");
                if (currentPageNode != null)
                {
                    currentPage = currentPageNode.InnerText.Trim();
                }

                var reviews = htmlDoc.DocumentNode.SelectNodes("//*[@itemtype='http://schema.org/Review']");

                if (reviews == null || reviews.Count == 0)
                {
                    return;
                }



                //**TODO: handle duplicates properly.
                //The current approach saves each review one by one and ignores any error on save,
                //which also means every review is re-attempted no matter what :(
                GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();

                foreach (var r in reviews)
                {
                    string   reviewUrl;
                    int      reviewId = -1;
                    Match    match;
                    var      reviewLinkNode = r.SelectSingleNode(".//div[@class='gr_review_text']/link[@itemprop='url']");
                    DateTime publishDate    = DateTime.MinValue;
                    short    starRating     = 0;

                    Review toAdd = new Review();
                    if (reviewLinkNode != null)
                    {
                        reviewUrl = reviewLinkNode.GetAttributeValue("href", "null");
                        match     = regReview.Match(reviewUrl);

                        if (Int32.TryParse(match.Groups[1].Value, out reviewId))
                        {
                            if (CrawlReviewIFrame.existingReviewIds.Contains(reviewId))
                            {
                                continue;
                            }

                            var node = r.SelectSingleNode(".//span[@class='gr_review_date']");
                            if (node != null)
                            {
                                DateTime.TryParse(node.InnerText, out publishDate);
                            }

                            node = r.SelectSingleNode(".//span[@class='gr_rating']");
                            if (node != null)
                            {
                                starRating = CrawlUtil.countStarsFromString(node.InnerText);
                            }

                            toAdd.id          = reviewId;
                            toAdd.bookId      = CrawlReviewIFrame.currentBook.id;
                            toAdd.publishDate = publishDate;
                            toAdd.starRating  = starRating;
                            toAdd.foundOnPage = Int32.Parse(currentPage);
                            toAdd.maxPage     = maxPage;

                            context.Reviews.AddObject(toAdd);

                            try
                            {
                                context.SaveChanges();
                            }
                            catch (Exception ex)
                            {
                                //detach the failed entity so it can't poison later SaveChanges calls
                                context.Reviews.Detach(toAdd);

                                CrawlReviewIFrame.form.appendLineToLog(ex.Message);
                                if (ex.InnerException != null)
                                {
                                    CrawlReviewIFrame.form.appendLineToLog("\t" + ex.InnerException.Message);
                                }
                            }
                        }
                    }
                }

                CrawlReviewIFrame.form.appendLineToLog("Added " + reviews.Count + " on page " + currentPage + " of " + maxPage + " for " + CrawlReviewIFrame.currentBook.getShortTitle());
            }
        }
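        // Hedged sketch (not in the original listing): regReview,
        // CrawlUtil.countStarsFromString and CrawlUtil.getMaxReviewIFramePageNumber are
        // consumed above but not defined here. Review links in the widget have the form
        // ".../review/show/<id>...", the gr_rating span renders filled-star glyphs, and the
        // pagination selector below is an assumption; plausible implementations:
        // (Requires: using HtmlAgilityPack; using System.Text.RegularExpressions;)
        static readonly Regex regReview = new Regex(@"/review/show/(\d+)", RegexOptions.Compiled);

        //counts the filled stars in e.g. "★★★★☆" => 4; returns 0 if no stars are found
        static short countStarsFromString(string s)
        {
            if (String.IsNullOrEmpty(s))
            {
                return 0;
            }

            short stars = 0;
            foreach (char c in s)
            {
                if (c == '\u2605') //'★' BLACK STAR
                {
                    stars++;
                }
            }
            return stars;
        }

        //finds the highest page number among the iframe's pagination links; -1 when there
        //is no pagination (assumed markup; the real selector may differ)
        static int getMaxReviewIFramePageNumber(HtmlDocument htmlDoc)
        {
            var links = htmlDoc.DocumentNode.SelectNodes("//a[contains(@href, 'page=')]");
            if (links == null)
            {
                return -1;
            }

            int max = -1;
            foreach (var a in links)
            {
                int p;
                if (Int32.TryParse(a.InnerText.Trim(), out p) && p > max)
                {
                    max = p;
                }
            }
            return max;
        }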
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            //add pages only if at base uri page
            if (propertyBag.ResponseUri.ToString() == "http://www.goodreads.com/list/show/1.Best_Books_Ever?page=" + CrawlList.FromPage)
            {
                string uri = "";

                for (int i = CrawlList.FromPage + 1; i <= CrawlList.ToPage; i++)
                {
                    uri = "http://www.goodreads.com/list/show/1.Best_Books_Ever?page=" + i;
                    crawler.AddStep(new Uri(uri), 0);
                    CrawlList.form.appendLineToLog("also crawling " + uri);
                }
            }

            //only process list pages
            if (!propertyBag.ResponseUri.OriginalString.StartsWith("http://www.goodreads.com/list/show/1.Best_Books_Ever"))
            {
                return;
            }

            var s = propertyBag["HtmlDoc"].Value;

            HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;

            if (htmlDoc != null)
            {
                lock (this)
                {
                    var books = htmlDoc.DocumentNode.SelectNodes("//tr[@itemtype='http://schema.org/Book']");

                    if (books == null || books.Count == 0)
                    {
                        return;
                    }

                    GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();
                    foreach (var b in books)
                    {
                        string title         = "null";
                        string authorName    = "null";
                        var    titleURLNode  = b.SelectSingleNode(".//*[@class='bookTitle']");
                        var    authorURLNode = b.SelectSingleNode(".//*[@class='authorName']");
                        string titleUrl      = "null";
                        string authorUrl     = "null";
                        Match  match;
                        string bookId   = "-1";
                        string authorId = "-1";
                        Book   newBook  = null;
                        Author author   = null;


                        if (titleURLNode != null && authorURLNode != null)
                        {
                            titleUrl = titleURLNode.GetAttributeValue("href", "null");
                            match    = regBook.Match(titleUrl);
                            bookId   = match.Groups[1].Value;
                            title    = titleURLNode.InnerText.Trim();

                            authorUrl  = authorURLNode.GetAttributeValue("href", "null");
                            match      = regAuthor.Match(authorUrl);
                            authorId   = match.Groups[1].Value;
                            authorName = authorURLNode.InnerText.Trim();

                            author  = CrawlUtil.createOrGetAuthor(context, Int32.Parse(authorId), authorName);
                            newBook = CrawlUtil.createOrGetBook(context, Int32.Parse(bookId), title);

                            newBook.Author = author;
                            //author.Book = newBook;
                        }

                        context.SaveChanges();
                    }

                    CrawlList.form.appendLineToLog("added/updated " + books.Count + " books and their authors");
                }
            }
        }
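        // Hedged sketch (not in the original listing): regBook and regAuthor are used
        // above to pull numeric ids out of hrefs such as
        // "/book/show/2767052-the-hunger-games" and "/author/show/153394.Suzanne_Collins".
        // Patterns along these lines would put the id in Groups[1]:
        // (Requires: using System.Text.RegularExpressions;)
        static readonly Regex regBook   = new Regex(@"/book/show/(\d+)",   RegexOptions.Compiled);
        static readonly Regex regAuthor = new Regex(@"/author/show/(\d+)", RegexOptions.Compiled);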
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;

            if (htmlDoc == null)
            {
                return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
            }

            //only process the review page
            if (propertyBag.ResponseUri.OriginalString != CrawlReviews.baseUri)
            {
                return;
            }


            lock (this)
            {
                GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();

                Review review = CrawlUtil.getReview(context, ReviewId);

                if (review == null)
                {
                    return; //this should never happen
                }

                //userId	int	Checked
                //userIdString	varchar(256)	Checked
                //reviewContent	varchar(MAX)	Checked
                //starRating	smallint	Checked
                //publishDate	date	Checked
                //recommendedFor	varchar(MAX)	Checked
                //numComments	int	Checked

                HtmlNode doc = htmlDoc.DocumentNode;

                int      userId        = -1;
                string   userIdString  = null;
                string   userName      = null;
                string   reviewContent = null;
                short    starRating    = -1;
                DateTime publishDate;
                //string recommendedFor = null;
                int numComments = -1;

                var reviewAuthorNode = doc.SelectSingleNode("//a[@itemprop='author']");
                if (reviewAuthorNode != null)
                {
                    userIdString = reviewAuthorNode.GetAttributeValue("href", null);

                    userIdString = CrawlUtil.extractUserIdStringFromUrl(userIdString); //null if error
                    userId       = CrawlUtil.extractUserIdFromString(userIdString);    //-1 or 0 if error
                    userName     = reviewAuthorNode.InnerText.Trim();                  //empty if error

                    if (userIdString != null && userId > 0 && !String.IsNullOrEmpty(userName))
                    {
                        var user = CrawlUtil.createOrGetUser(context, userId, userIdString, userName);
                        review.User         = user;
                        review.userIdString = userIdString;
                    }

                    //<a href="/user/show/52663-mer" class="userReview" itemprop="author">Mer</a>
                }

                var reviewContentNode = doc.SelectSingleNode("//div[@itemprop='reviewBody']");
                if (reviewContentNode != null)
                {
                    review.reviewContent = reviewContentNode.InnerText.Trim();
                }

                var starRatingNode = doc.SelectSingleNode("//div[@class='rating']/span[@class='value-title']");
                if (starRatingNode != null)
                {
                    short.TryParse(starRatingNode.GetAttributeValue("title", ""), out starRating);

                    review.starRating = starRating;
                    //<span class="value-title" title="5"></span>
                }

                var publishDateNode = doc.SelectSingleNode("//div[@class='right dtreviewed greyText smallText']/span[@class='value-title']");
                if (publishDateNode != null)
                {
                    DateTime.TryParse(publishDateNode.GetAttributeValue("title", ""), out publishDate);

                    review.publishDate = publishDate;
                    //<span class="value-title" title="2007-04-28"></span>
                }

                var recommendedForNode = doc.SelectSingleNode("//span[text()='Recommended for:']");
                if (recommendedForNode != null)
                {
                    review.recommendedFor = recommendedForNode.ParentNode.LastChild.InnerText.Trim();

                    /*
                     * <div>
                     *  <span class="">Recommended for:</span>
                     *     enviornmentalists, nurturers, parents and children who want to discuss empathy and reciprocity
                     * </div>
                     */
                }

                //var numCommentsNode = doc.SelectSingleNode("");
                //if (numCommentsNode != null)
                //{
                //    /*

                //     *  <h2 class="brownBackground">
                //     *  <div class="extraHeader">
                //     *      <a href="#comment_form" rel="nofollow">Post a comment &raquo;</a>
                //     *  </div>Comments
                //     *  <span class="smallText">
                //            (showing
                //            1-5
                //            of
                //            5)
                //         </span>
                //     * ...
                //     * </h2>
                //     */
                //}

                context.SaveChanges();
                CrawlReviews.form.appendLineToLog("Added review " + review.id + " by user " + review.User.name);
            }
        }
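        // Hedged sketch (not in the original listing): CrawlUtil.extractUserIdStringFromUrl
        // and CrawlUtil.extractUserIdFromString are called above with documented error
        // values (null and -1 respectively). Given hrefs like "/user/show/52663-mer" (quoted
        // in a comment above), the id string is the last path segment and the numeric id is
        // its leading digits:
        // (Requires: using System.Text.RegularExpressions;)
        static string extractUserIdStringFromUrl(string url)
        {
            if (String.IsNullOrEmpty(url))
            {
                return null; //null on error, as noted at the call site
            }

            int i = url.LastIndexOf('/');
            return (i >= 0 && i < url.Length - 1) ? url.Substring(i + 1) : null;
        }

        static int extractUserIdFromString(string userIdString)
        {
            if (String.IsNullOrEmpty(userIdString))
            {
                return -1; //-1 on error, as noted at the call site
            }

            Match m = Regex.Match(userIdString, @"^(\d+)");
            return m.Success ? Int32.Parse(m.Groups[1].Value) : -1;
        }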
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;

            if (htmlDoc == null)
            {
                return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
            }

            HtmlNode doc = htmlDoc.DocumentNode;

            //on page 1, add other pages for user
            if (propertyBag.ResponseUri.OriginalString == CrawlListAndVotes.baseUri)
            {
/*
 *
 * <div>
 * <span class="previous_page disabled">&laquo; previous</span>
 * <em class="current">1</em>
 * <a rel="next" href="/list/user_votes/1045275-natasha?page=2">2</a>
 * <a href="/list/user_votes/1045275-natasha?page=3">3</a>
 * <a href="/list/user_votes/1045275-natasha?page=4">4</a>
 * <a href="/list/user_votes/1045275-natasha?page=5">5</a>
 * <a class="next_page" rel="next" href="/list/user_votes/1045275-natasha?page=2">next &raquo;</a>
 * </div>
 */
                var node = doc.SelectSingleNode(".//a[@class='next_page' and @rel='next']");

                if (node != null)
                {
                    try
                    {
                        var maxPageNode = node.PreviousSibling.PreviousSibling;
                        int maxPage     = Int32.Parse(maxPageNode.InnerText.Trim());

                        string uri;
                        for (int i = 2; i <= maxPage; i++)
                        {
                            uri = "http://www.goodreads.com/list/user_votes/" + User.userIdString + "?page=" + i;
                            crawler.AddStep(new Uri(uri), 0);

                            CrawlListAndVotes.form.appendLineToLog(uri);
                        }
                    }
                    catch (Exception ex)
                    {
                        CrawlListAndVotes.form.appendLineToLog(ex.Message);
                    }
                }
            }

            lock (this)
            {
                GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();

                foreach (var listNode in doc.SelectNodes(".//div[@class='cell']"))
                {
                    List   l         = null;
                    string title     = null;
                    var    titleNode = listNode.SelectSingleNode(".//a[@class='listTitle']");
                    if (titleNode != null)
                    {
                        title = titleNode.InnerText.Trim();
                    }

                    if (title != null)
                    {
                        l = CrawlUtil.createOrGetList(context, title);
                    }
                    else
                    {
                        continue;
                    }

/*
 *  296 books
 *  &mdash;
 *  994 voters
 */
                    var statsNode = listNode.SelectSingleNode(".//div[@class='listFullDetails']");
                    if (statsNode != null)
                    {
                        string s = statsNode.InnerText.Replace("\n", "").Trim();
                        l.numBooks = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));

                        s           = s.Substring(s.IndexOf("books"));
                        l.numVoters = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                    }

                    User u = CrawlUtil.getUser(context, User.id);
                    u.Lists.Add(l);

                    try
                    {
                        context.SaveChanges();
                        CrawlListAndVotes.count++;
                    }
                    catch (Exception ex)
                    {
                        u.Lists.Remove(l); //undo the Add; the usual cause is a duplicate-PK inner exception
                        //CrawlListAndVotes.form.appendLineToLog(ex.Message);
                    }
                }

                CrawlListAndVotes.form.appendLineToLog(User.userIdString + ":: " + CrawlListAndVotes.count + " lists added");
            }
        }
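        // Hedged sketch (not in the original listing): CrawlUtil.extractNumberFromString is
        // used throughout and compared against -1 on failure; call sites feed it strings
        // like "296 books", "2,392 friends" and "(3.82 avg)". A version that returns the
        // first number found, with thousands separators stripped, fits that usage:
        // (Requires: using System.Globalization; using System.Text.RegularExpressions;)
        static decimal extractNumberFromString(string s)
        {
            if (String.IsNullOrEmpty(s))
            {
                return -1;
            }

            Match m = Regex.Match(s.Replace(",", ""), @"\d+(\.\d+)?");
            return m.Success ? decimal.Parse(m.Value, CultureInfo.InvariantCulture) : -1;
        }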
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;

            if (htmlDoc == null)
            {
                return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
            }

            HtmlNode doc = htmlDoc.DocumentNode;

            lock (this)
            {
                GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();
                HtmlNodeCollection       nodes;
                HtmlNode node;

                //update user's status changes: private (default 0), author (default 0), or username
                var url = propertyBag.ResponseUri.AbsoluteUri;
                if (url != CrawlUserProfile.baseUri)        //user is author or has changed useridstring
                {
                    if (url == "http://www.goodreads.com/") //this means the user's profile no longer exists
                    {
                        CrawlUserProfile.form.appendLineToLog(string.Format("User {0} no longer exists."
                                                                            , User.userIdString
                                                                            ));
                        User.IsPrivate = true;
                    }

                    if (url.Contains("/author/"))
                    {
                        User.IsAuthor  = true;
                        User.authorUrl = url;
                    }

                    if (url.Contains("/show/")) //both users and authors can change usernames
                    {
                        //when users change their username it can be extracted from the new url.
                        if (User.IsAuthor != true)
                        {
                            User.userIdString = CrawlUtil.extractUserIdStringFromUrl(url);
                        }
                        else //if an author has changed username, then this cannot be detected from the authorurl (to which we are redirected)
                        {
                            //search for the new username using the user's id. e.g. for links to /user/show/37868189
                            node = doc.SelectSingleNode(".//a[starts-with(@href, '/user/show/" + User.id + "')]");
                            if (node != null)
                            {
                                var idStringToExtract = node.GetAttributeValue("href", "");
                                User.userIdString = CrawlUtil.extractUserIdStringFromUrl(idStringToExtract);
                            }
                        }
                    }


                    CrawlUserProfile.form.appendLineToLog(string.Format("User {0} updated with status IsAuthor = {1}| {2}; IsPrivate = {3}"
                                                                        , User.userIdString
                                                                        , User.IsAuthor
                                                                        , User.authorUrl
                                                                        , User.IsPrivate
                                                                        ));
                }



                //name
                node = doc.SelectSingleNode("//h1//text()[normalize-space()]");
                if (node != null)
                {
                    User.name = CrawlUtil.extractNameFromString(node.InnerText);
                    //inner text for user: "******"
                    //         for author: "Derek White"
                }

                /*
                 * <div class="leftContainer">
                 * <div class="leftAlignedImage" style="overflow: hidden; width: 110px;">
                 *  <a href="/photo/user/3255548-ah" rel="nofollow" title="AH">
                 *    <img alt="AH" src="http://d.gr-assets.com/users/1275413869p3/3255548.jpg" />
                 *  </a>
                 *  <div class="smallText">
                 *    <a href="/review/list/3255548?sort=rating&amp;view=reviews">874 ratings</a>
                 *    <a href="#" onclick="Element.toggle('ratingDistribution3255548');; new Ajax.Updater('ratingDistribution3255548', '/user/rating_distribution/3255548', {asynchronous:true, evalScripts:true, method:'get', onComplete:function(request){return false;}, parameters:'authenticity_token=' + encodeURIComponent('dlcB28CHfXju2vqnShahQlNLoL76d9c6QNZMYZI332g=')}); return false;">(3.82 avg)</a>
                 *    <div class="floatingBox" style="display:none; width: 400px;" id="ratingDistribution3255548"></div>
                 *    <br/>
                 *    <a href="/review/list/3255548?sort=review&amp;view=reviews">1218 reviews</a>
                 *    <br />
                 *    <a href="/photo/user/3255548-ah" rel="nofollow">more photos (3)</a>
                 *    <br/>
                 *    <br/>
                 #<a href="/user/best_reviewers?country=CA&amp;duration=a">10 best reviewers</a>
                 *    <br/>
                 #<a href="/user/top_reviewers?country=CA&amp;duration=a">36 top reviewers</a>
                 *
                 *  </div>
                 * </div>
                 * </div>
                 *
                 * 196 ratings
                 * (3.29 avg)
                 * 115 reviews
                 *
                 #10 best reviewers
                 #36 top reviewers
                 */
                //numRatings
                //numReviews
                //avgRating
                //badges
                //EXCEPT badges, this code works for both user and author pages
                nodes = doc.SelectNodes(".//div[@class='leftContainer']//div[@class='smallText']//text()[normalize-space()]");
                if (nodes == null)
                {
                    //private profiles still have this data but in a different HTML layout. If this 'different layout' is found
                    //then we assume this is a private profile.
                    User.IsPrivate = true;

                    CrawlUserProfile.form.appendLineToLog(string.Format("User {0} profile is deemed private."
                                                                        , User.userIdString
                                                                        ));
                    return;
                }
                else
                {
                    int badgeNum = 1;

                    //to get numerical values and badges
                    foreach (var n in nodes)
                    {
                        string inner = n.InnerText.Trim();

                        //if text begins with "//" then it's script text and we want to skip.
                        if (inner.StartsWith(@"//"))
                        {
                            continue;
                        }

                        //if text begins with "#" then it's a badge
                        if (inner.StartsWith("#"))// && n.NextSibling != null)
                        {
                            switch (badgeNum)
                            {
                            case 1:
                                User.badge1 = inner;
                                break;

                            case 2:
                                User.badge2 = inner;
                                break;

                            case 3:
                                User.badge3 = inner;
                                break;

                            case 4:
                                User.badge4 = inner;
                                break;

                            case 5:
                                User.badge5 = inner;
                                break;
                            }
                            badgeNum++;
                        }
                        else
                        {
                            decimal d = CrawlUtil.extractNumberFromString(inner);
                            if (d != -1)
                            {
                                if (inner.Contains("avg")) //author pages use "avg rating" which matches both "rating" and "avg" - so we match "avg" first
                                {
                                    User.avgRating = d;
                                }
                                else if (inner.Contains("rating"))
                                {
                                    User.numRatings = Convert.ToInt32(d);
                                }
                                else if (inner.Contains("review"))
                                {
                                    User.numReviews = Convert.ToInt32(d);
                                }
                            }
                        }
                    }
                }

                //to get the 'badges' for authors
                if (User.IsAuthor == true)
                {
                    nodes = doc.SelectNodes(".//div[@class='leftContainer']//div[@id='topListSection']//text()[normalize-space()]");
                    if (nodes != null)
                    {
                        int badgeNum = 1;
                        foreach (var n in nodes)
                        {
                            string inner = n.InnerText.Trim();

                            if (inner.StartsWith("#"))// && n.NextSibling != null)
                            {
                                switch (badgeNum)
                                {
                                case 1:
                                    User.badge1 = inner;
                                    break;

                                case 2:
                                    User.badge2 = inner;
                                    break;

                                case 3:
                                    User.badge3 = inner;
                                    break;

                                case 4:
                                    User.badge4 = inner;
                                    break;

                                case 5:
                                    User.badge5 = inner;
                                    break;
                                }
                                badgeNum++;
                            }
                        }
                    }
                }

                //<a href="/friend/user/104320-erin-beck" rel="nofollow">Erin’s Friends (4)</a>
                //<a rel="nofollow" href="/friend/user/3094317-tori-smexybooks-smexys-sidekick">(Tori-Smexybooks)’s Friends (1,505)</a>
                //numFriends
                //Works for both users and authors
                node = doc.SelectSingleNode(".//a[@href='/friend/user/" + User.userIdString + "']/text()");
                if (node != null)
                {
                    decimal d = CrawlUtil.extractNumberFromString(node.InnerText);

                    if (d != -1)
                    {
                        User.numFriends = Convert.ToInt32(d);
                    }
                }

                /* each friend's html
                 * <div class="left">
                 * <div class="friendName">
                 *  <a href="/user/show/355607-t-k-kenyon" rel="acquaintance">T.K. Kenyon</a>
                 * </div>
                 * 819 books
                 * <span class="greyText">|</span>
                 * 2,392 friends
                 * </div>
                 */
                //8 friends' summary details
                if (CrawlUserProfile.addFriends)
                {
                    nodes = doc.SelectNodes(".//div[@class='left']");

                    if (nodes != null)
                    {
                        foreach (var n in nodes) //each friend
                        {
                            Friendship f = new Friendship();
                            f.userId = this.User.id;

                            node = n.SelectSingleNode(".//div[@class='friendName']/a");
                            if (node != null)
                            {
                                string attr = node.GetAttributeValue("href", "");
                                f.friendIdString = CrawlUtil.extractUserIdStringFromUrl(attr);

                                f.rel = node.GetAttributeValue("rel", "");
                            }
                            else
                            {
                                continue;
                            }

                            node = n.SelectSingleNode("./text()[normalize-space()]"); //number of books
                            if (node != null)
                            {
                                int numBooks = Convert.ToInt32(CrawlUtil.extractNumberFromString(node.InnerText));

                                if (numBooks != -1)
                                {
                                    f.friendNumBooks = numBooks;
                                }

                                node = node.SelectSingleNode("following-sibling::text()"); //number of friends
                                if (node != null)
                                {
                                    int numFriends = Convert.ToInt32(CrawlUtil.extractNumberFromString(node.InnerText));

                                    if (numFriends != -1)
                                    {
                                        f.friendNumFriends = numFriends;
                                    }
                                }
                            }

                            try
                            {
                                context.Friendships.AddObject(f);
                                context.SaveChanges();
                            }
                            catch (Exception)
                            {
                                context.Friendships.Detach(f);
                            }
                        }
                    }
                }

                //User.numFollowers
                //Users (non-authors) <a class="actionLink" rel="nofollow" href="/user/3094317-tori-smexybooks-smexys-sidekick/followers">319 people are following (Tori-Smexybooks)</a>
                //authors <a href="/author_followings?id=6458332&amp;method=get">Lucas Lyndes’s Followers (8)</a>

                if (User.IsAuthor == true)
                {
                    node = doc.SelectSingleNode(".//a[starts-with(@href, '/author_followings?')]/text()");
                }
                else
                {
                    node = doc.SelectSingleNode(".//a[@href='/user/" + User.userIdString + "/followers']/text()");
                }
                if (node != null)
                {
                    decimal d = CrawlUtil.extractNumberFromString(node.InnerText);

                    if (d != -1)
                    {
                        User.numFollowers = Convert.ToInt32(d);
                    }
                }


                //User.numUserIsFollowing - see [done]goodreads-numUserIsFollowing logic
                //N/A for authors
                if (User.IsAuthor == true)
                {
                    User.numUserIsFollowing = null;
                }
                else
                {
                    nodes = doc.SelectNodes(".//a[contains(text(),'is Following')]/../../../div[@class='bigBoxBody']//div/a");
                    if (nodes != null)
                    {
                        User.numUserIsFollowing = nodes.Count;
                    }
                }


                //see [done]goodreads-quiz logic.html
                //User.quizNumCorrect
                //User.quizNumQuestions
                //User.quizRank
                //User.quizRankOutOf
                //N/A for authors
                if (User.IsAuthor == true)
                {
                    User.quizNumCorrect   = null;
                    User.quizNumQuestions = null;
                    User.quizRank         = null;
                    User.quizRankOutOf    = null;
                }
                else
                {
                    nodes = doc.SelectNodes(".//div[@id='myQuizStats']//div[@class='infoBoxRowTitle' or @class='infoBoxRowItem']");
                    if (nodes != null)
                    {
                        string s = "";
                        foreach (var n in nodes)
                        {
                            if (n.InnerText.Contains("questions answered"))
                            {
                                s = n.NextSibling.NextSibling.InnerText;
                                User.quizNumQuestions = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                                //CrawlUserProfile.form.appendLineToLog("answered: " + CrawlUtil.extractNumberFromString(s) + " (" + s + ")");
                            }

                            else if (n.InnerText.Contains("correct"))
                            {
                                s = n.NextSibling.NextSibling.InnerText;
                                User.quizNumCorrect = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                                //CrawlUserProfile.form.appendLineToLog("correct: " + CrawlUtil.extractNumberFromString(s) + " (" + s + ")");
                            }

                            else if (n.InnerText.Contains("ranking"))
                            {
                                s             = n.NextSibling.NextSibling.InnerText;
                                User.quizRank = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                                //CrawlUserProfile.form.appendLineToLog("ranking: " + CrawlUtil.extractNumberFromString(s) + " (" + s + ")");

                                s = s.Substring(s.IndexOf("out of"));
                                User.quizRankOutOf = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                                //CrawlUserProfile.form.appendLineToLog("of: " + CrawlUtil.extractNumberFromString(s));
                            }
                        }
                    }
                }


                //User.ReadingChallenges - see [done]goodreads-challenge logic.html
                //CHECK if working for authors
                if (CrawlUserProfile.addChallenges)
                {
                    nodes = doc.SelectNodes(".//div[@class='challengePic']");

                    if (nodes != null)
                    {
                        foreach (var n in nodes)
                        {
                            var challengePic = n.SelectSingleNode(".//img");

                            if (challengePic != null)
                            {
                                string challenge = challengePic.GetAttributeValue("alt", "unknown");

                                if (challenge != "unknown")
                                {
                                    ReadingChallenge rc = CrawlUtil.createOrGetChallenge(context, User.id, challenge);
                                    try
                                    {
                                        var stats = n.NextSibling.NextSibling.SelectSingleNode(".//div[@class='bookMeta progressStats']");

                                        if (stats != null)
                                        {
                                            string s = stats.InnerText;
                                            rc.numBooksRead = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));

                                            s = s.Substring(s.IndexOf(" of "));
                                            rc.numBooksTarget = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                                        }
                                        else
                                        {
                                            stats = n.NextSibling.NextSibling.SelectSingleNode(".//a[@class='challengeBooksRead']");

                                            if (stats != null)
                                            {
                                                string s = stats.InnerText;
                                                rc.numBooksRead = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));

                                                s = s.Substring(s.IndexOf(" of "));
                                                rc.numBooksTarget = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                                            }
                                        }

                                        rc.lastUpdated = DateTime.Now;

                                        context.SaveChanges();
                                    }
                                    catch (Exception)
                                    {
                                        context.ReadingChallenges.Detach(rc);
                                    }
                                }
                            }
                        }
                    }
                }

                /*
                 * <a href="/librarian/user/255233-sonic" rel="nofollow">Goodreads librarian</a>
                 */
                //User.isLibrarian
                node = doc.SelectSingleNode(".//a[@href='/librarian/user/" + User.userIdString + "']");

                if (node != null && node.InnerText == "Goodreads librarian")
                {
                    User.IsLibrarian = true;
                }
                else
                {
                    User.IsLibrarian = false;
                }

                //User.Genres - see [done]goodreads-genres (fav) logic.html
                //UNCHECKED for round 5 update
                if (CrawlUserProfile.addGenres)
                {
                    nodes = doc.SelectNodes(".//h2[contains(text(),'Favorite Genres')]/../../div[@class='bigBoxBody']//a");

                    if (nodes != null)
                    {
                        foreach (var n in nodes)
                        {
                            Genre g = new Genre();
                            g.name   = n.InnerText.Trim();
                            g.userId = User.id;

                            try
                            {
                                context.AddToGenres(g);
                                context.SaveChanges();
                            }
                            catch (Exception)
                            {
                                context.Genres.Detach(g);
                            }
                        }
                    }
                }

                //User.Activities - see [done]goodreads-updates logic.html
                //UNCHECKED for round 5 update
                if (CrawlUserProfile.addActivities)
                {
                    nodes = doc.SelectNodes(".//table[@class='tableListReverse friendUpdates']/tr[@class='update' or @class='no_border']");

                    if (nodes != null)
                    {
                        foreach (var n in nodes)
                        {
                            if ("update" == n.GetAttributeValue("class", "unknown"))
                            {
                                Activity a = new Activity();
                                try
                                {
                                    a.userId = User.id;

                                    a.activityHTML = n.InnerHtml.Length > 8000 ? n.InnerHtml.Substring(0, 7999) : n.InnerHtml;

                                    var ts = n.SelectSingleNode("following-sibling::tr//a[@class='updatedTimestamp']");
                                    a.activityTimestampString = ts.InnerText;
                                    a.retrievedAt             = DateTime.Now;

                                    context.AddToActivities(a);
                                    context.SaveChanges();
                                }
                                catch (Exception)
                                {
                                    context.Activities.Detach(a);
                                }
                            }
                        }
                    }
                }

                //separate function/class User.numFavouriteAuthors
                //separate function/class User.Groups
                //separate function/class User.Lists
                //separate function/class User.numShelves
            }

            CrawlUserProfile.form.appendLineToLog(User.userIdString + ":: details and 'ticked' flag updated.");
        }
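        // Hedged sketch (not in the original listing): the CrawlUtil.createOrGetX helpers
        // (createOrGetUser, createOrGetAuthor, createOrGetBook, createOrGetList,
        // createOrGetChallenge) all follow a lookup-then-insert pattern against the legacy
        // EF ObjectContext used above. A representative version for users, with entity-set
        // and property names assumed from their usage in this listing:
        // (Requires: using System.Linq;)
        static User createOrGetUser(GoodReadsCrawlerEntities context, int userId, string userIdString, string userName)
        {
            User user = context.Users.FirstOrDefault(u => u.id == userId);
            if (user == null)
            {
                user = new User { id = userId };
                context.Users.AddObject(user);
            }

            //refresh mutable fields on every crawl
            user.userIdString = userIdString;
            user.name         = userName;
            return user;
        }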
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;

            if (htmlDoc == null)
            {
                return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
            }

            HtmlNode doc = htmlDoc.DocumentNode;

            //on page 1, add other pages for user
            if (propertyBag.ResponseUri.OriginalString == CrawlReviewsOnUserProfile.baseUri)
            {
                var node = doc.SelectSingleNode(".//div[@id='reviewPagination']");

                if (node != null)
                {
                    try
                    {
                        var maxPageNode = node.LastChild.PreviousSibling.PreviousSibling;
                        int maxPage     = Int32.Parse(maxPageNode.InnerText.Trim());

                        string uri;
                        for (int i = 2; i <= maxPage; i++)
                        {
                            uri = "http://www.goodreads.com/review/list/" + User.userIdString + "?format=html&page=" + i + "&per_page=100&shelf=read";
                            crawler.AddStep(new Uri(uri), 0);

                            CrawlReviewsOnUserProfile.form.appendLineToLog(uri);
                        }
                    }
                    catch (Exception)
                    {
                        //pagination present but the max page couldn't be parsed; just crawl page 1
                    }
                }
            }

            lock (this)
            {
                GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();

                foreach (var reviewNode in doc.SelectNodes(".//tr[@class='bookalike review']"))
                {
                    CrawlReviewsOnUserProfile.count++;

                    string reviewIdString = reviewNode.GetAttributeValue("id", "");

                    if (reviewIdString == "")
                    {
                        return;
                    }

                    int reviewId = Int32.Parse(reviewIdString.Split('_')[1]);

                    Review review = CrawlUtil.getReview(context, reviewId);

                    //create and process the REVIEW if it doesn't already exist
                    if (review == null)
                    {
                        HtmlNode node;
                        review = new Review();

                        review.id = reviewId;

                        //REVIEW.rating
                        node = reviewNode.SelectSingleNode(".//td[@class='field rating']//img");
                        if (node != null)
                        {
                            string ratingString = node.GetAttributeValue("alt", "0");
                            short  rating       = short.Parse(ratingString.Substring(0, 1));

                            review.starRating = rating;
                        }

                        //REVIEW.publishdate
                        node = reviewNode.SelectSingleNode(".//td[@class='field date_added']//span");
                        if (node != null)
                        {
                            DateTime date;
                            DateTime.TryParse(node.InnerText, out date);

                            review.publishDate = date;
                        }

                        //USER
                        review.userId       = User.id;
                        review.userIdString = User.userIdString;

                        //BOOK
                        node = reviewNode.SelectSingleNode(".//td[@class='field title']//a");
                        string bookUrl = node.GetAttributeValue("href", "");

                        int bookId = CrawlUtil.extractIdNumberFromUrl(bookUrl); //if bookUrl is null then bookId gets set to 0

                        Book book = CrawlUtil.getBook(context, bookId);

                        if (book == null)
                        {
                            book    = new Book();
                            book.id = bookId;

                            string title = node.GetAttributeValue("title", "");
                            book.title = title;

                            node = reviewNode.SelectSingleNode(".//td[@class='field isbn']//div");
                            if (node != null)
                            {
                                book.isbn = node.InnerText.Trim();
                            }

                            //AUTHOR
                            node = reviewNode.SelectSingleNode(".//td[@class='field author']//a");
                            if (node != null)
                            {
                                string authorUrl = node.GetAttributeValue("href", "");

                                int authorId = CrawlUtil.extractIdNumberFromUrl(authorUrl); //if authorUrl is null then authorId gets set to 0

                                Author author = CrawlUtil.getAuthor(context, authorId);

                                if (author == null)
                                {
                                    author    = new Author();
                                    author.id = authorId;

                                    author.name = CrawlUtil.formatAuthorName(node.InnerText.Trim());

                                    book.Author = author;
                                }
                            }
                        }

                        review.Book = book;

                        context.SaveChanges();
                    }
                }

                CrawlReviewsOnUserProfile.form.appendLineToLog(User.userIdString + ":: " + CrawlReviewsOnUserProfile.count + " reviews crawled");
            }
        }
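        // Hedged sketch (not in the original listing): CrawlUtil.extractIdNumberFromUrl is
        // called above with the note that a missing url yields 0. Goodreads urls carry the
        // id as the leading digits of the last path segment
        // ("/book/show/2767052-the-hunger-games" => 2767052), so:
        // (Requires: using System.Text.RegularExpressions;)
        static int extractIdNumberFromUrl(string url)
        {
            if (String.IsNullOrEmpty(url))
            {
                return 0; //0 on a missing url, as noted at the call sites
            }

            Match m = Regex.Match(url, @"/(\d+)[^/]*$");
            return m.Success ? Int32.Parse(m.Groups[1].Value) : 0;
        }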
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;

            if (htmlDoc == null)
            {
                return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
            }

            HtmlNode doc = htmlDoc.DocumentNode;

            //on page 1, add other pages for user
            if (propertyBag.ResponseUri.OriginalString == CrawlReviewsOnUserProfile_Updated.baseUri)
            {
                var node = doc.SelectSingleNode(".//div[@id='reviewPagination']");

                if (node != null)
                {
                    HtmlNode maxPageNode;
                    int      maxPage = 0;

                    try
                    {
                        maxPageNode = node.LastChild.PreviousSibling.PreviousSibling;
                        maxPage     = Int32.Parse(maxPageNode.InnerText.Trim());
                    }
                    catch (Exception)
                    {
                        CrawlReviewsOnUserProfile_Updated.form.appendLineToLog("Error getting maxPage on " + propertyBag.ResponseUri.OriginalString);
                        return;
                    }

                    //get new reviews since last crawl?
                    int pagesToCrawl = 0;
                    if (CrawlReviewsOnUserProfile_Updated.getReviewsSinceLastCrawl)
                    {
                        pagesToCrawl = maxPage - (User.Reviews.Count / 20); //int division results in truncation
                        if (pagesToCrawl < 1)
                        {
                            return;
                        }

                        /**** TEMP to get pages 30 and above (for users with more than 600 reviews (after getting up to 600 only on a previous run))****/
                        //for (int i = 30; i <= maxPage; i++)
                        //{
                        //    String s= "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + i + "&print=true&shelf=read&sort=date_added&view=reviews";
                        //    crawler.AddStep(new Uri(s), 0);

                        //    CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(s);
                        //}
                        //return;

                        /*** Old logic pre 2015 11 30 ***
                         * int startPage = (User.Reviews.Count / 20)+1;
                         * string uri;
                         * for (int i = startPage; i <= maxPage; i++)
                         * {
                         *  uri = "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + i + "&print=true&shelf=read&sort=date_added&view=reviews";
                         *  crawler.AddStep(new Uri(uri), 0);
                         *
                         *  CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(uri);
                         * }
                         * return;
                         * *************/
                    }
                    else //crawl every page
                    {
                        pagesToCrawl = maxPage;
                    }

                    string uri;
                    for (int i = 2; i <= pagesToCrawl; i++)
                    {
                        //http://www.goodreads.com/review/list/1-otis-chandler?page=3&print=true&shelf=read
                        uri = "http://www.goodreads.com/review/list/" + User.userIdString + "?page=" + i + "&print=true&shelf=read";
                        crawler.AddStep(new Uri(uri), 0);

                        CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(uri);
                    }
                    //continue with the crawl on page 1 unless pagesToCrawl is 0 (i.e. no pages need to be crawled)


                    //2015-11-30: getting the X latest pages is now redundant since reviews are now always sorted by date added ascending.
                    //feature removed for 2015 update 5 crawl

                    //get reviews from specified pages; or latest X pages
                    //get user's latest X pages of reviews if user has more than (maxPage * 20) reviews; X determined by getHowManyLatestPages
                    //if (maxPage > CrawlReviewsOnUserProfile_Updated.maxPage)
                    //{
                    //    if (CrawlReviewsOnUserProfile_Updated.getHowManyLatestPages > 0)
                    //    {
                    //        int numLatestPages = Math.Min(CrawlReviewsOnUserProfile_Updated.getHowManyLatestPages, maxPage - CrawlReviewsOnUserProfile_Updated.maxPage);
                    //        for (int i = 0; i < numLatestPages; i++)
                    //        {
                    //            string uriLatest = "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + (maxPage - i) + "&print=true&shelf=read&sort=date_added&view=reviews";
                    //            crawler.AddStep(new Uri(uriLatest), 0);

                    //            CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(uriLatest);
                    //        }
                    //    }

                    //    maxPage = CrawlReviewsOnUserProfile_Updated.maxPage;
                    //}

                    //string u;
                    //for (int i = CrawlReviewsOnUserProfile_Updated.minPage; i <= maxPage; i++)
                    //{
                    //    u = "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + i + "&print=true&shelf=read&sort=date_added&view=reviews";
                    //    crawler.AddStep(new Uri(u), 0);

                    //    CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(u);
                    //}

                    //if don't want to include page 1 then don't crawl after adding other pages to crawl
                    //if (CrawlReviewsOnUserProfile_Updated.minPage > 1)
                    //{
                    //    return;
                    //}
                }
            }

            lock (this)
            {
                GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();

                foreach (var reviewNode in doc.SelectNodes(".//tr[@class='bookalike review']"))
                {
                    string reviewIdString = reviewNode.GetAttributeValue("id", "");

                    if (reviewIdString == "")
                    {
                        return;
                    }

                    int reviewId = Int32.Parse(reviewIdString.Split('_')[1]);

                    //Review review = CrawlUtil.createOrGetReview(context, reviewId);
                    Review review = CrawlUtil.getReview(context, reviewId);

                    if (review == null) //review is new
                    {
                        review    = new Review();
                        review.id = reviewId;

                        context.Reviews.AddObject(review);
                    }
                    else //review already exists
                    {
                        continue;
                    }

                    HtmlNode node;

                    //REVIEW.rating

                    /*<td class="field rating">
                     *  <label>Reb's rating</label>
                     *  <div class="value">
                     *      <a class=" staticStars stars_4" title="really liked it">4 of 5 stars</a>
                     *  </div>
                     * </td>*/
                    node = reviewNode.SelectSingleNode(".//td[@class='field rating']//a");
                    if (node != null)
                    {
                        string ratingClassString = node.GetAttributeValue("class", "0");
                        short  rating            = CrawlUtil.getRatingFromClassString(ratingClassString);

                        review.starRating = rating;
                    }

                    //REVIEW.publishdate
                    node = reviewNode.SelectSingleNode(".//td[@class='field date_added']//span");
                    if (node != null)
                    {
                        DateTime date;
                        DateTime.TryParse(node.InnerText, out date);

                        review.publishDate = date;
                    }

                    //USER
                    review.userId       = User.id;
                    review.userIdString = User.userIdString;

                    //BOOK
                    node = reviewNode.SelectSingleNode(".//td[@class='field title']//a");
                    string bookUrl = node.GetAttributeValue("href", "");

                    int bookId = CrawlUtil.extractIdNumberFromUrl(bookUrl); //if bookUrl is null then bookId gets set to 0

                    Book book = CrawlUtil.getBook(context, bookId);

                    if (book == null)
                    {
                        book    = new Book();
                        book.id = bookId;

                        string title = node.GetAttributeValue("title", "");
                        book.title = title;

                        node = reviewNode.SelectSingleNode(".//td[@class='field isbn']//div");
                        if (node != null)
                        {
                            book.isbn = node.InnerText.Trim();
                        }

                        //AUTHOR
                        node = reviewNode.SelectSingleNode(".//td[@class='field author']//a");
                        if (node != null)
                        {
                            string authorUrl = node.GetAttributeValue("href", "");

                            int authorId = CrawlUtil.extractIdNumberFromUrl(authorUrl); //if authorUrl is null then authorId gets set to 0

                            Author author = CrawlUtil.getAuthor(context, authorId);

                            if (author == null)
                            {
                                author    = new Author();
                                author.id = authorId;

                                author.name = CrawlUtil.formatAuthorName(node.InnerText.Trim());

                                book.Author = author;
                            }
                        }
                    }

                    review.Book = book;

                    context.SaveChanges();


                    CrawlReviewsOnUserProfile_Updated.count++;
                }

                CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(User.userIdString + ":: " + CrawlReviewsOnUserProfile_Updated.count + " reviews crawled");
            }
        }
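        // Hedged sketch (not in the original listing): CrawlUtil.getRatingFromClassString is
        // used above against markup like <a class=" staticStars stars_4" ...> (quoted in the
        // comment before the call), so the helper plausibly reads the digit off the
        // "stars_N" token and falls back to 0:
        // (Requires: using System.Text.RegularExpressions;)
        static short getRatingFromClassString(string classString)
        {
            Match m = Regex.Match(classString ?? "", @"stars_(\d)");
            return m.Success ? short.Parse(m.Groups[1].Value) : (short)0;
        }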