/// <summary>
/// Extracts the replies nested under a Tieba floor and publishes them under
/// the "TiebaFloorReply" result key.
/// </summary>
/// <param name="page">
/// The downloaded page. Nothing is produced unless the request URL matches
/// "tieba.baidu.com/t/p/[0-9]*".
/// </param>
protected override void Handle(Page page)
{
    // Any other URL is not a floor-reply page: nothing to extract.
    if (!Regex.IsMatch(page.Request.RequestUri.AbsoluteUri, "tieba.baidu.com/t/p/[0-9]*"))
    {
        return;
    }

    var replies = new List<TiebaFloorReply>();
    var replyNodes = page.Selectable()
        .SelectList(Selectors.XPath(".//ul[@class='pb_lzl_content j_floor_panel']"))
        .Nodes();

    foreach (var replyNode in replyNodes)
    {
        var reply = new TiebaFloorReply();

        // The reply id and the user name come from the JSON in the data-info attribute.
        var rawInfo = replyNode.Select(Selectors.XPath("@data-info")).GetValue();
        var info = JObject.Parse(rawInfo);
        reply.Key = info["pid"].ToString();
        reply.UserName = info["un"].ToString();

        reply.UserNickName = replyNode
            .Select(Selectors.XPath(".//a[@class='user_name ']"))
            .GetValue(ValueOption.InnerText);

        // Thread id: first run of digits in the header-bar link of the page.
        var threadHref = page.Selectable()
            .Select(Selectors.XPath(".//div[@class='pb_lzl_header_bar']//a//@href"))
            .GetValue();
        reply.ThreadID = Regex.Match(threadHref, "[0-9]+").Value;

        // Floor id: first run of digits in the request URL.
        reply.FloorID = Regex.Match(page.Request.Url, "[0-9]+").Value;

        reply.Body = replyNode
            .Select(Selectors.XPath(".//a[@class='lzl_content j_lzl_content ']"))
            .GetValue(ValueOption.InnerHtml);

        var rawTime = replyNode
            .Select(Selectors.XPath(".//div[@class='left ']//div//p"))
            .GetValue()
            .Trim();
        reply.ReplyTime = TiebaTime.Parse(rawTime);
        reply.UpdateTime = DateTime.Now;

        replies.Add(reply);
    }

    // Publish the data so that the storage pipeline can pick it up.
    if (replies.Count > 0)
    {
        page.AddResultItem("TiebaFloorReply", replies);
    }
}
/// <summary>
/// Extracts the floors (replies) of a Tieba thread page, publishes them under
/// the "TiebaFloor" result key and queues the next page of the thread when
/// the pager shows that one exists.
/// </summary>
/// <param name="page">
/// The downloaded page. Nothing is produced unless the request URL matches
/// "tieba.baidu.com/p/[0-9]*".
/// </param>
protected override void Handle(Page page)
{
    var results = new List<TiebaFloor>();

    if (Regex.IsMatch(page.Request.RequestUri.AbsoluteUri, "tieba.baidu.com/p/[0-9]*"))
    {
        var floorElements = page.Selectable()
            .SelectList(Selectors.XPath(".//div[@class='l_post l_post_bright j_l_post clearfix ']"))
            .Nodes();

        // Walk every floor element and read its data.
        foreach (var floorElement in floorElements)
        {
            TiebaFloor tiebaFloor = new TiebaFloor();

            // The floor metadata is HTML-encoded JSON stored in the data-field attribute.
            var json = floorElement.Select(Selectors.XPath("@data-field")).GetValue();
            JObject jObject = JObject.Parse(HttpUtility.HtmlDecode(json));
            var content = jObject["content"];

            tiebaFloor.PostIndex = Convert.ToInt32(content["post_no"].ToString());

            // A floor index of 0 is the thread body itself, which is not stored as a floor.
            if (tiebaFloor.PostIndex == 0)
            {
                continue;
            }

            tiebaFloor.Key = content["post_id"].ToString();
            tiebaFloor.ForumID = content["forum_id"].ToString();
            tiebaFloor.ThreadID = content["thread_id"].ToString();
            tiebaFloor.FloorBody = content["content"].ToString();

            var author = jObject["author"];
            tiebaFloor.UserName = author["user_name"].ToString();
            tiebaFloor.UserNickName = author["user_nickname"].ToString();
            tiebaFloor.UserID = author["user_id"].ToString();

            // The comment time is read from the last "tail-info" span of the floor.
            string time = floorElement
                .SelectList(Selectors.XPath(".//span[@class='tail-info']"))
                .Nodes()
                .Last()
                .GetValue();
            tiebaFloor.CommentTime = TiebaTime.Parse(time);
            tiebaFloor.UpdateTime = DateTime.Now;

            results.Add(tiebaFloor);
        }

        // Total page count: the last red number inside the reply-count item.
        // A page without a usable pager is treated as having no next page, so
        // the floors parsed above are still published instead of being lost to
        // an exception thrown before AddResultItem is reached.
        var totalNode = page.Selectable()
            .SelectList(Selectors.XPath(".//li[@class='l_reply_num']//span[@class='red']"))
            .Nodes()
            .LastOrDefault();

        int total;
        if (totalNode != null && int.TryParse(totalNode.GetValue(), out total))
        {
            // Current page index; defaults to 1 when the marker is missing,
            // empty or not a number.
            string currStr = page.Selectable().SelectList(Selectors.XPath(".//span[@class='tP']"))?.GetValue();
            int currIndex;
            if (!int.TryParse(currStr, out currIndex))
            {
                currIndex = 1;
            }

            if (currIndex < total)
            {
                // Only the next page is queued (rather than all remaining pages at once):
                // a thread can have a great many pages and the pending-request queue
                // could otherwise exhaust memory.
                page.AddTargetRequest(new Request(page.Request.RequestUri.SetParameter("pn", (currIndex + 1).ToString()).AbsoluteUri));
            }
        }
    }

    // Publish the data so that the storage pipeline can pick it up.
    if (results.Count > 0)
    {
        page.AddResultItem("TiebaFloor", results);
    }
}
/// <summary>
/// Extracts thread ("post") data from Tieba pages and publishes it under the
/// "TiebaPost" result key.
/// </summary>
/// <remarks>
/// Two kinds of page are handled:
/// a thread list page (URL contains "tieba.baidu.com/f?"), where every listed
/// thread is recorded and its detail page is queued; and a thread detail page
/// (URL matches "tieba.baidu.com/p/[0-9]*"), where a single record with the
/// richer detail-page data is produced.
/// </remarks>
/// <param name="page">The downloaded page.</param>
protected override void Handle(Page page)
{
    var results = new List<TiebaPost>();

    // Thread list page: collect every listed thread.
    if (page.Request.RequestUri.AbsoluteUri.Contains("tieba.baidu.com/f?"))
    {
        var totalPostsElements = page.Selectable()
            .SelectList(Selectors.XPath(".//li[@class='tl_shadow tl_shadow_new ']"))
            .Nodes();

        foreach (var postElement in totalPostsElements)
        {
            TiebaPost tiebaPost = new TiebaPost();
            tiebaPost.Key = postElement.Select(Selectors.XPath(".//a[@class='j_common ti_item ']/@data-tid")).GetValue();
            tiebaPost.Title = postElement.Select(Selectors.XPath(".//div[@class='ti_title']")).GetValue(ValueOption.InnerText).Trim();
            tiebaPost.ReplyNum = Convert.ToInt32(postElement.Select(Selectors.XPath(".//div[@class='ti_func_btn btn_reply']")).GetValue(ValueOption.InnerText));
            tiebaPost.UserName = postElement.Select(Selectors.XPath(".//div[@class='ti_author_icons clearfix']//span")).GetValue(ValueOption.InnerText).Trim();

            string time = postElement.Select(Selectors.XPath(".//span[@class='ti_time']")).GetValue();
            tiebaPost.StartTime = DateTime.Parse(time);
            tiebaPost.UpdateTime = DateTime.Now;

            results.Add(tiebaPost);

            // Queue the detail page of the thread that was just found in the list.
            page.AddTargetRequest(new Request($"https://tieba.baidu.com/p/{tiebaPost.Key}"));
        }
    }
    // Thread detail page: produce a record with the additional detail-page data.
    else if (Regex.IsMatch(page.Request.RequestUri.AbsoluteUri, "tieba.baidu.com/p/[0-9]*"))
    {
        TiebaPost tiebaPost = new TiebaPost();

        // The title is read from <h1>; if that lookup throws for any reason
        // the <h3> element is used instead.
        try
        {
            tiebaPost.Title = page.Selectable().Select(Selectors.XPath(".//h1")).GetValue().Trim();
        }
        catch (Exception)
        {
            tiebaPost.Title = page.Selectable().Select(Selectors.XPath(".//h3")).GetValue().Trim();
        }

        tiebaPost.ReplyNum = Convert.ToInt32(page.Selectable().SelectList(Selectors.XPath(".//li[@class='l_reply_num']//span")).Nodes().First().GetValue());

        // The first div under the post list carries the thread's own metadata as
        // HTML-encoded JSON in its data-field attribute.
        var postElement = page.Selectable().SelectList(Selectors.XPath(".//div[@id='j_p_postlist']//div")).Nodes().First();
        var json = postElement.Select(Selectors.XPath("@data-field")).GetValue();
        JObject jObject = JObject.Parse(HttpUtility.HtmlDecode(json));
        var content = jObject["content"];

        // NOTE(review): the list-page branch uses the thread id (data-tid) as Key while
        // this branch uses post_id; confirm both are meant to identify the same record.
        tiebaPost.Key = content["post_id"].ToString();

        // The indexer returns null when the property is absent. (Calling Contains("content")
        // on the token compares child tokens by reference and is therefore never true,
        // which left Body permanently unset.)
        var bodyToken = content["content"];
        if (bodyToken != null)
        {
            tiebaPost.Body = bodyToken.ToString();
        }

        var author = jObject["author"];
        tiebaPost.UserName = author["user_name"].ToString();
        tiebaPost.UserNickName = author["user_nickname"].ToString();
        tiebaPost.UserID = author["user_id"].ToString();

        // Prefer the date from the JSON; otherwise fall back to the last span
        // of the post tail.
        string time;
        var dateToken = content["date"];
        if (dateToken != null)
        {
            time = dateToken.ToString();
        }
        else
        {
            var ls = postElement.Select(Selectors.XPath("//div[@class='post-tail-wrap']")).XPath(".//span").Nodes();
            time = ls.Last().GetValue();
        }

        tiebaPost.StartTime = TiebaTime.Parse(time);
        tiebaPost.UpdateTime = DateTime.Now;

        results.Add(tiebaPost);
    }

    // Publish the data so that the storage pipeline can pick it up.
    if (results.Count > 0)
    {
        page.AddResultItem("TiebaPost", results);
    }
}