예제 #1
0
        /// <summary>
        /// Pipeline step for a user's "list votes" page. On the first page it queues the
        /// remaining result pages on the crawler; on every page it reads each
        /// <c>div.cell</c> entry, creates or fetches the matching list record, fills in its
        /// book/voter counts, links it to the current user and saves it.
        /// </summary>
        /// <param name="crawler">Crawler that receives the additional page URIs via <c>AddStep</c>.</param>
        /// <param name="propertyBag">
        /// Crawl result for the page; its "HtmlDoc" entry is expected to hold the parsed
        /// <see cref="HtmlDocument"/>. When it does not, the page is skipped.
        /// </param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;

            if (htmlDoc == null)
            {
                return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
            }

            HtmlNode doc = htmlDoc.DocumentNode;

            //on page 1, add other pages for user
            if (propertyBag.ResponseUri.OriginalString == CrawlListAndVotes.baseUri)
            {
/*
 * Pagination markup this code relies on:
 *
 * <div>
 * <span class="previous_page disabled">&laquo; previous</span>
 * <em class="current">1</em>
 * <a rel="next" href="/list/user_votes/1045275-natasha?page=2">2</a>
 * <a href="/list/user_votes/1045275-natasha?page=3">3</a>
 * <a href="/list/user_votes/1045275-natasha?page=4">4</a>
 * <a href="/list/user_votes/1045275-natasha?page=5">5</a>
 * <a class="next_page" rel="next" href="/list/user_votes/1045275-natasha?page=2">next &raquo;</a>
 * </div>
 */
                var node = doc.SelectSingleNode(".//a[@class='next_page' and @rel='next']");

                if (node != null)
                {
                    try
                    {
                        // Two siblings back from the "next" link: skips the whitespace text node
                        // and lands on the link for the last page number.
                        var x       = node.PreviousSibling.PreviousSibling;
                        int maxPage = Int32.Parse(x.InnerText.Trim());

                        string uri;
                        for (int i = 2; i <= maxPage; i++)
                        {
                            uri = "http://www.goodreads.com/list/user_votes/" + User.userIdString + "?page=" + i;
                            crawler.AddStep(new Uri(uri), 0);

                            CrawlListAndVotes.form.appendLineToLog(uri);
                        }
                    }
                    catch (Exception ex)
                    {
                        // Unexpected markup (missing sibling, non-numeric text): log it and
                        // carry on with the lists found on this page.
                        CrawlListAndVotes.form.appendLineToLog(ex.Message);
                    }
                }
            }

            // HtmlAgilityPack's SelectNodes returns null (not an empty collection) when
            // nothing matches, so the result must be checked before it is enumerated.
            var listNodes = doc.SelectNodes(".//div[@class='cell']");

            // NOTE(review): lock (this) is kept because other members may synchronize on the
            // same instance; a private lock object would be preferable if none do.
            lock (this)
            {
                if (listNodes != null)
                {
                    GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();

                    foreach (var listNode in listNodes)
                    {
                        var titleNode = listNode.SelectSingleNode(".//a[@class='listTitle']");
                        if (titleNode == null)
                        {
                            continue; // a cell without a title link is not a list entry
                        }

                        string title = titleNode.InnerText.Trim();
                        List   l     = CrawlUtil.createOrGetList(context, title);

/*
 * Stats text looks like:
 *
 *  296 books
 *  &mdash;
 *  994 voters
 */
                        var statsNode = listNode.SelectSingleNode(".//div[@class='listFullDetails']");
                        if (statsNode != null)
                        {
                            string s = statsNode.InnerText.Replace("\n", "").Trim();
                            // presumably extractNumberFromString yields the first number in the text - verify
                            l.numBooks = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));

                            // Skip past the book count so the next number found is the voter count.
                            // Searching for "book" also covers the singular "1 book"; the index is
                            // checked because Substring(-1) would throw and abort the whole page.
                            int booksIndex = s.IndexOf("book", StringComparison.Ordinal);
                            if (booksIndex >= 0)
                            {
                                l.numVoters = Convert.ToInt32(CrawlUtil.extractNumberFromString(s.Substring(booksIndex)));
                            }
                        }

                        User u = CrawlUtil.getUser(context, User.id);
                        u.Lists.Add(l);

                        try
                        {
                            context.SaveChanges();
                            CrawlListAndVotes.count++;
                        }
                        catch (Exception)
                        {
                            // Typically a duplicate primary key (the vote is already stored).
                            // Undo the association on the same user instance it was added to,
                            // so the failed entry is not retried by every later SaveChanges.
                            u.Lists.Remove(l);
                        }
                    }
                }

                CrawlListAndVotes.form.appendLineToLog(User.userIdString + ":: " + CrawlListAndVotes.count + " lists added");
            }
        }