protected override void Handle(Page page)
        {
            // Builds one TiebaFloorReply per matched reply element on a floor-reply page and
            // publishes the batch under the "TiebaFloorReply" result key.
            var replies = new List<TiebaFloorReply>();

            // Pages whose URL does not match the floor-reply pattern produce no results at all.
            bool isFloorReplyPage = Regex.IsMatch(page.Request.RequestUri.AbsoluteUri, "tieba.baidu.com/t/p/[0-9]*");
            if (!isFloorReplyPage)
            {
                return;
            }

            var replyNodes = page.Selectable().SelectList(Selectors.XPath(".//ul[@class='pb_lzl_content j_floor_panel']")).Nodes();
            foreach (var replyNode in replyNodes)
            {
                var reply = new TiebaFloorReply();

                // The element's data-info attribute is parsed as JSON; "pid" and "un" are read from it.
                var dataInfo = replyNode.Select(Selectors.XPath("@data-info")).GetValue();
                JObject info = JObject.Parse(dataInfo);
                reply.Key      = info["pid"].ToString();
                reply.UserName = info["un"].ToString();

                reply.UserNickName = replyNode.Select(Selectors.XPath(".//a[@class='user_name ']")).GetValue(ValueOption.InnerText);

                // Thread id: first run of digits in the href of the header-bar link (looked up on the whole page).
                var headerHref = page.Selectable().Select(Selectors.XPath(".//div[@class='pb_lzl_header_bar']//a//@href")).GetValue();
                reply.ThreadID = Regex.Match(headerHref, "[0-9]+").Value;

                // Floor id: first run of digits in the request URL.
                reply.FloorID = Regex.Match(page.Request.Url, "[0-9]+").Value;

                reply.Body = replyNode.Select(Selectors.XPath(".//a[@class='lzl_content j_lzl_content ']")).GetValue(ValueOption.InnerHtml);

                var replyTimeText = replyNode.Select(Selectors.XPath(".//div[@class='left ']//div//p")).GetValue().Trim();
                reply.ReplyTime  = TiebaTime.Parse(replyTimeText);
                reply.UpdateTime = DateTime.Now;

                replies.Add(reply);
            }

            // Hand the collected records to the page so the storage side can pick them up.
            if (replies.Count > 0)
            {
                page.AddResultItem("TiebaFloorReply", replies);
            }
        }
示例#2
0
        /// <summary>
        /// Extracts floor records from a thread detail page, publishes them under the
        /// "TiebaFloor" result key, and queues the next page of the thread when one remains.
        /// </summary>
        /// <param name="page">The fetched page to parse; it also receives the follow-up request and the results.</param>
        protected override void Handle(Page page)
        {
            var floors = new List<TiebaFloor>();

            // Only thread detail URLs are handled; anything else yields no results.
            if (!Regex.IsMatch(page.Request.RequestUri.AbsoluteUri, "tieba.baidu.com/p/[0-9]*"))
            {
                return;
            }

            var floorNodes = page.Selectable().SelectList(Selectors.XPath(".//div[@class='l_post l_post_bright j_l_post clearfix  ']")).Nodes();
            foreach (var floorNode in floorNodes)
            {
                // Each floor element carries its data as HTML-encoded JSON in the data-field attribute.
                var floor    = new TiebaFloor();
                var rawField = floorNode.Select(Selectors.XPath("@data-field")).GetValue();
                JObject field = JObject.Parse(HttpUtility.HtmlDecode(rawField));

                var content = field["content"];
                floor.PostIndex = Convert.ToInt32(content["post_no"].ToString());

                // A floor index of 0 is the thread's own body, so it is not recorded as a floor.
                if (floor.PostIndex == 0)
                {
                    continue;
                }

                floor.Key       = content["post_id"].ToString();
                floor.ForumID   = content["forum_id"].ToString();
                floor.ThreadID  = content["thread_id"].ToString();
                floor.FloorBody = content["content"].ToString();

                var author = field["author"];
                floor.UserName     = author["user_name"].ToString();
                floor.UserNickName = author["user_nickname"].ToString();
                floor.UserID       = author["user_id"].ToString();

                // The comment time is taken from the last tail-info span of the floor.
                string timeText = floorNode.SelectList(Selectors.XPath(".//span[@class='tail-info']")).Nodes().Last().GetValue();
                floor.CommentTime = TiebaTime.Parse(timeText);
                floor.UpdateTime  = DateTime.Now;

                floors.Add(floor);
            }

            // Total number of pages in the thread: the last red span inside the reply-count item.
            int pageCount = Convert.ToInt32(page.Selectable().SelectList(Selectors.XPath(".//li[@class='l_reply_num']//span[@class='red']")).Nodes().Last().GetValue());

            // Current page index; defaults to 1 when the current-page marker yields no value.
            string currentPageText = page.Selectable().SelectList(Selectors.XPath(".//span[@class='tP']"))?.GetValue();
            int currentPage = currentPageText != null ? Convert.ToInt32(currentPageText) : 1;

            if (currentPage < pageCount)
            {
                // Queue only the next page rather than all remaining pages at once: a thread can
                // have very many pages and the pending-request queue could exhaust memory.
                var nextPageUri = page.Request.RequestUri.SetParameter("pn", (currentPage + 1).ToString());
                page.AddTargetRequest(new Request(nextPageUri.AbsoluteUri));
            }

            // Hand the collected records to the page so the storage side can pick them up.
            if (floors.Count > 0)
            {
                page.AddResultItem("TiebaFloor", floors);
            }
        }
示例#3
0
        /// <summary>
        /// Extracts <c>TiebaPost</c> records from a fetched page and publishes them under the
        /// "TiebaPost" result key.
        /// </summary>
        /// <remarks>
        /// Two kinds of URL are handled. A list page (URL contains "tieba.baidu.com/f?") yields one
        /// record per list entry and queues a detail request for each. A detail page (URL matches
        /// "tieba.baidu.com/p/[0-9]*") yields a single record built from the first post element's
        /// <c>data-field</c> JSON. Nothing is published when no record was produced.
        /// </remarks>
        /// <param name="page">The fetched page to parse; it also receives follow-up requests and the results.</param>
        protected override void Handle(Page page)
        {
            var results = new List<TiebaPost>();

            // Thread list page: collect every thread shown in the list.
            if (page.Request.RequestUri.AbsoluteUri.Contains("tieba.baidu.com/f?"))
            {
                var totalPostsElements = page.Selectable().SelectList(Selectors.XPath(".//li[@class='tl_shadow tl_shadow_new ']")).Nodes();
                foreach (var postElement in totalPostsElements)
                {
                    TiebaPost tiebaPost = new TiebaPost();

                    tiebaPost.Key      = postElement.Select(Selectors.XPath(".//a[@class='j_common ti_item ']/@data-tid")).GetValue();
                    tiebaPost.Title    = postElement.Select(Selectors.XPath(".//div[@class='ti_title']")).GetValue(ValueOption.InnerText).Trim();
                    tiebaPost.ReplyNum = Convert.ToInt32(postElement.Select(Selectors.XPath(".//div[@class='ti_func_btn btn_reply']")).GetValue(ValueOption.InnerText));
                    tiebaPost.UserName = postElement.Select(Selectors.XPath(".//div[@class='ti_author_icons  clearfix']//span")).GetValue(ValueOption.InnerText).Trim();
                    string time = postElement.Select(Selectors.XPath(".//span[@class='ti_time']")).GetValue();
                    tiebaPost.StartTime  = DateTime.Parse(time);
                    tiebaPost.UpdateTime = DateTime.Now;

                    results.Add(tiebaPost);

                    // Queue the thread's detail page so it gets crawled as well.
                    page.AddTargetRequest(new Request($"https://tieba.baidu.com/p/{tiebaPost.Key}"));
                }
            }
            // Thread detail page: build a fuller record than the list page provides.
            else if (Regex.IsMatch(page.Request.RequestUri.AbsoluteUri, "tieba.baidu.com/p/[0-9]*"))
            {
                TiebaPost tiebaPost = new TiebaPost();
                try
                {
                    tiebaPost.Title = page.Selectable().Select(Selectors.XPath(".//h1")).GetValue().Trim();
                }
                catch (Exception)
                {
                    // Reading the h1 heading failed (presumably the page has none and the value is
                    // null - verify against the selector API), so fall back to the h3 heading.
                    tiebaPost.Title = page.Selectable().Select(Selectors.XPath(".//h3")).GetValue().Trim();
                }
                tiebaPost.ReplyNum = Convert.ToInt32(page.Selectable().SelectList(Selectors.XPath(".//li[@class='l_reply_num']//span")).Nodes().First().GetValue());

                // The first div under the post list carries the thread's own data as HTML-encoded JSON.
                var     postElement = page.Selectable().SelectList(Selectors.XPath(".//div[@id='j_p_postlist']//div")).Nodes().First();
                var     json        = postElement.Select(Selectors.XPath("@data-field")).GetValue();
                JObject jObject     = JObject.Parse(HttpUtility.HtmlDecode(json));
                tiebaPost.Key = jObject["content"]["post_id"].ToString();

                // Test for the "content" property through the indexer. Calling Contains("content") on
                // the token compares the string against the object's child tokens instead of its
                // property names, so it never reports a match and Body would never be filled in.
                if (jObject["content"]["content"] != null)
                {
                    tiebaPost.Body = jObject["content"]["content"].ToString();
                }
                tiebaPost.UserName     = jObject["author"]["user_name"].ToString();
                tiebaPost.UserNickName = jObject["author"]["user_nickname"].ToString();
                tiebaPost.UserID       = jObject["author"]["user_id"].ToString();

                string time;
                if (jObject["content"]["date"] != null)
                {
                    time = jObject["content"]["date"].ToString();
                }
                else
                {
                    // No date in the JSON: take the last span of the post-tail block instead.
                    var tailSpans = postElement.Select(Selectors.XPath("//div[@class='post-tail-wrap']")).XPath(".//span").Nodes();
                    time = tailSpans.Last().GetValue();
                }
                tiebaPost.StartTime  = TiebaTime.Parse(time);
                tiebaPost.UpdateTime = DateTime.Now;

                results.Add(tiebaPost);
            }

            // Hand the collected records to the page so the storage side can pick them up.
            if (results.Count > 0)
            {
                page.AddResultItem("TiebaPost", results);
            }
        }