/// <summary>
/// Builds a Feed for one "similar repost" entry that is nested inside another post's markup.
/// </summary>
/// <param name="currentPage">Index of the page the post was found on.</param>
/// <param name="i">Position of the enclosing post within its page; used in diagnostic messages and forwarded to the helper extractors.</param>
/// <param name="similarFeedDiv">The div tag that contains the similar repost.</param>
/// <param name="originalAuthor">Author of the post being reposted.</param>
/// <param name="feedContent">Content of the post being reposted.</param>
/// <returns>A populated Feed instance.</returns>
private Feed GetSimilarFeed(int currentPage, int i, TagNode similarFeedDiv, string originalAuthor, string feedContent)
{
    Feed feed = new Feed();
    serialNumber++;
    feed.Page = currentPage;
    feed.Number = serialNumber;
    feed.ReFeedOrNot = true;

    // --- Author of the similar repost ---
    NodeList feedAuthorNodeList = similarFeedDiv.Children.ExtractAllNodesThatMatch(feedAuthorFilter, true);
    if (feedAuthorNodeList.Size() == 1)
    {
        ATag feedAuthorTag = (ATag)feedAuthorNodeList[0].Children[1];
        feed.Author = feedAuthorTag.GetAttribute("TITLE");

        // If present, the remark (alias) name is a span directly following the author link.
        // The link may have no following sibling at all, so guard against null.
        INode remarkNameNode = feedAuthorTag.NextSibling;
        if (remarkNameNode != null && remarkNameNode.GetType().Equals(typeof(Span)))
        {
            string remarkName = ((Span)remarkNameNode).StringText;
            // Strip the surrounding brackets; skip text too short to contain both of them.
            if (remarkName != null && remarkName.Length >= 2)
            {
                feed.RemarkName = remarkName.Substring(1, remarkName.Length - 2);
            }
        }
    }
    else
    {
        Console.WriteLine("第" + i + "条微博中,判断转发类似微博的作者的标准出错!");
    }

    // --- Repost reason of the similar repost ---
    NodeList reFeedReasonNodeList = similarFeedDiv.Children.ExtractAllNodesThatMatch(feedContentFilter, true);
    if (reFeedReasonNodeList.Size() == 1)
    {
        feed.ReFeedReason = GetContentFromChildren(feed, reFeedReasonNodeList[0], false);
    }
    else
    {
        Console.WriteLine("第" + i + "条微博中,判断转发类似微博的转发理由的标准出错!");
    }

    // This is a similar repost, so the original post's author and content come from the caller.
    feed.OriginalAuthor = originalAuthor;
    feed.Content = feedContent;

    // Remaining metadata: location, like count, repost count, comment count, time and sending device.
    feed.Location = GetLocationInfo(i, similarFeedDiv);
    feed.LikeCount = GetFeedLikeInfo(i, similarFeedDiv);
    feed.ReFeedCount = GetFeedForwardCount(i, similarFeedDiv);
    feed.CommentCount = GetFeedCommentCount(i, similarFeedDiv);
    feed.Time = GetFeedTimeInfo(i, similarFeedDiv);
    feed.Device = GetFeedSendTypeInfo(i, similarFeedDiv);

    return feed;
}
/// <summary>
/// Helper: walks the direct children of the given HTML node and concatenates them into the post's text content.
/// If a text node ends with "//", content collection stops there and the user links that follow are
/// recorded on <paramref name="feed"/> as the repost chain (ReFeedFrom).
/// </summary>
/// <param name="feed">Feed instance that receives the repost chain when one is found.</param>
/// <param name="node">HTML node whose children are scanned.</param>
/// <param name="hasEmTag">
/// Whether the children contain an EM tag (per the original author, only reposts do). When true,
/// collection starts only after the first EM tag is seen; when false, it starts at the first child.
/// </param>
/// <returns>The collected content with leading/trailing spaces and line breaks removed.</returns>
private static string GetContentFromChildren(Feed feed, INode node, bool hasEmTag)
{
    string content = "";
    bool start = !hasEmTag;

    for (int i = 0; i < node.Children.Size(); i++)
    {
        INode child = node.Children[i];
        Type t = child.GetType();

        if (!start)
        {
            // Still skipping the preamble: wait for the EM tag before collecting anything.
            if (t.Equals(typeof(TagNode)) && ((TagNode)child).TagName.Equals("EM"))
            {
                start = true;
            }
            continue;
        }

        if (t.Equals(typeof(TextNode)))
        {
            string str = ((TextNode)child).ToPlainTextString();

            // A trailing "//" marks the end of the content; what follows is the chain of reposters.
            if (str.EndsWith("//", StringComparison.Ordinal))
            {
                // Drop the "//" marker itself.
                content += str.Substring(0, str.Length - 2);

                // Collect the repost chain from the user-card links after the marker.
                string reFeedFrom = "";
                for (int j = i + 1; j < node.Children.Size(); j++)
                {
                    INode candidate = node.Children[j];
                    if (!candidate.GetType().Equals(typeof(ATag)) ||
                        !((ATag)candidate).Attributes.ContainsKey("USERCARD"))
                    {
                        continue;
                    }

                    string oneReFeeder = ((ATag)candidate).StringText;
                    // Guard against empty link text before looking at its first character.
                    if (!string.IsNullOrEmpty(oneReFeeder) && oneReFeeder[0] == '@')
                    {
                        // Drop the leading "@" and prepend, so the chain ends up in reverse document order.
                        reFeedFrom = reFeedFrom.Insert(0, oneReFeeder.Substring(1) + " ");
                    }
                    else
                    {
                        Console.WriteLine("获取转发链时出现错误!此前的转发链为" + reFeedFrom);
                    }
                }

                feed.ReFeedFrom = reFeedFrom;
                break;
            }

            content += str;
            continue;
        }

        if (t.Equals(typeof(ATag)))
        {
            ATag aTagNode = (ATag)child;
            // A link may contain more than plain text (e.g. nested span tags), so gather every
            // text node found anywhere beneath it.
            NodeClassFilter textNodeFilter = new NodeClassFilter(typeof(TextNode));
            NodeList nodeList = aTagNode.Children.ExtractAllNodesThatMatch(textNodeFilter, true);
            for (int j = 0; j < nodeList.Size(); j++)
            {
                content += ((TextNode)nodeList[j]).ToPlainTextString();
            }
            continue;
        }

        if (t.Equals(typeof(TagNode)))
        {
            content += ((TagNode)child).ToPlainTextString();
            continue;
        }

        if (t.Equals(typeof(ImageTag)))
        {
            // Images contribute their TITLE attribute as text.
            content += ((ImageTag)child).GetAttribute("TITLE");
            continue;
        }
    }

    // The extracted text sometimes starts or ends with spaces and line breaks; strip them.
    return content.Trim(' ', '\r', '\n');
}
/// <summary>
/// Fills in the repost-related fields of a feed: original author, original content and repost reason.
/// </summary>
/// <param name="i">Position of the post within its page (used in diagnostic messages).</param>
/// <param name="feed">Feed instance to populate.</param>
/// <param name="reFeedDiv">The div tag that contains the reposted (original) post.</param>
/// <param name="feedDiv">The div tag that contains the whole post.</param>
private void GetReFeedInfo(int i, Feed feed, TagNode reFeedDiv, TagNode feedDiv)
{
    // Mark the feed as a repost.
    feed.ReFeedOrNot = true;
    // Set when the markup indicates that the reposted post has been deleted.
    bool reFeedIsDeleted = false;

    // --- Original post's author ---
    NodeList reFeedOriginalAuthorNodeList = reFeedDiv.Children.ExtractAllNodesThatMatch(reFeedAuthorFilter, true);
    if (reFeedOriginalAuthorNodeList.Size() == 1)
    {
        INode reFeedOriginalAuthorNode = reFeedOriginalAuthorNodeList[0];
        // The author link's position among the children is not fixed (empty text nodes may
        // precede it), so scan for the first child that is a link tag.
        for (int j = 0; j < reFeedOriginalAuthorNode.Children.Size(); j++)
        {
            INode reFeedOriginalAuthorCandidate = reFeedOriginalAuthorNode.Children[j];
            if (reFeedOriginalAuthorCandidate.GetType().Equals(typeof(ATag)))
            {
                feed.OriginalAuthor = ((ATag)reFeedOriginalAuthorCandidate).GetAttribute("TITLE");
                break;
            }
        }
    }
    else
    {
        NodeList deletedFeedList = reFeedDiv.Children.ExtractAllNodesThatMatch(refeedDeletedFilter1, true);
        if (deletedFeedList.Size() > 0)
        {
            reFeedIsDeleted = true;
            feed.OriginalAuthor = "Unknown";
        }
        else
        {
            Console.WriteLine("第" + i + "条微博中,判断转发微博作者的标准出错!");
        }
    }

    // --- Original post's content ---
    NodeList reFeedContentNodeList = reFeedDiv.Children.ExtractAllNodesThatMatch(reFeedContentFilter, true);
    if (reFeedContentNodeList.Size() == 1)
    {
        // The matched node is handed over as-is and the helper walks its children.
        // hasEmTag is false: per the original author, profile pages carry no em node here.
        feed.Content = GetContentFromChildren(feed, reFeedContentNodeList[0], false);
    }
    else if (reFeedIsDeleted)
    {
        feed.Content = "微博已删除";
    }
    else
    {
        Console.WriteLine("第" + i + "条微博中,判断转发微博内容的标准出错!");
    }

    // --- This post's own content, used as the repost reason ---
    NodeList reFeedReasonNodeList = feedDiv.Children.ExtractAllNodesThatMatch(feedContentFilter, true);
    // When the post carries "X more reposts of the original", this list is expected to hold
    // (1 + similarFeedCount) entries; the first one belongs to this post. Guard against an
    // empty list instead of indexing it blindly.
    if (reFeedReasonNodeList.Size() > 0)
    {
        feed.ReFeedReason = GetContentFromChildren(feed, reFeedReasonNodeList[0], false);
    }
    else
    {
        Console.WriteLine("第" + i + "条微博中,判断转发理由的标准出错!");
    }
}
/// <summary>
/// Extracts posts from the web version of Weibo: parses every HTML fragment in htmlContentList
/// and appends one Feed per post (plus one per "similar repost") to <paramref name="feedList"/>.
/// </summary>
/// <param name="currentPage">Index of the page the crawled posts belong to.</param>
/// <param name="feedList">List that receives the extracted feeds.</param>
public void GetInfoFromHtml(int currentPage, List<Feed> feedList)
{
    foreach (string htmlContent in htmlContentList)
    {
        Lexer lexer = new Lexer(htmlContent);
        Parser parser = new Parser(lexer);
        // One div per post.
        NodeList feedNodeList = parser.Parse(feedFilter);

        for (int i = 0; i < feedNodeList.Size(); i++)
        {
            serialNumber++;
            Feed feed = new Feed();
            feed.Page = currentPage;
            feed.Number = serialNumber;

            // Number of "similar reposts" attached to this post.
            int similarFeedCount = 0;
            // The div of the i-th post.
            TagNode feedDiv = (TagNode)feedNodeList[i];

            // Does the post carry an "X more reposts of the original" section?
            NodeList similarfeedCountNodeList = feedDiv.Children.ExtractAllNodesThatMatch(similarFeedCountFilter, true);
            switch (similarfeedCountNodeList.Size())
            {
                case 1:
                {
                    // The section exists. The parser apparently does not treat <b> as a container,
                    // so the number shows up as the next sibling text node of the matched node.
                    TextNode countNode = similarfeedCountNodeList[0].NextSibling as TextNode;
                    if (countNode == null || !Int32.TryParse(countNode.ToPlainTextString(), out similarFeedCount))
                    {
                        // Unexpected markup: report it and carry on as if there were no similar reposts.
                        similarFeedCount = 0;
                        Console.WriteLine("第" + i + "条微博中,判断是否含有类似微博转发的标准出错!");
                    }
                    break;
                }
                case 0:
                    // No such section.
                    similarFeedCount = 0;
                    break;
                default:
                    Console.WriteLine("第" + i + "条微博中,判断是否含有类似微博转发的标准出错!");
                    break;
            }

            // --- Post author ---
            NodeList feedAuthorNodeList = feedDiv.Children.ExtractAllNodesThatMatch(feedAuthorFilter, true);
            // Within one post div, the author filter should match this post's author plus the
            // author of every similar repost, i.e. (1 + similarFeedCount) nodes.
            if (feedAuthorNodeList.Size() == (1 + similarFeedCount))
            {
                ATag feedAuthorTag = (ATag)feedAuthorNodeList[0].Children[0];
                feed.Author = feedAuthorTag.GetAttribute("TITLE");

                // If present, the remark (alias) name is a span directly following the author link.
                // The link may have no following sibling at all, so guard against null.
                INode remarkNameNode = feedAuthorTag.NextSibling;
                if (remarkNameNode != null && remarkNameNode.GetType().Equals(typeof(Span)))
                {
                    string remarkName = ((Span)remarkNameNode).StringText;
                    // Strip the surrounding brackets; skip text too short to contain both of them.
                    if (remarkName != null && remarkName.Length >= 2)
                    {
                        feed.RemarkName = remarkName.Substring(1, remarkName.Length - 2);
                    }
                }
            }
            else
            {
                // On the home timeline each post names its author; on a personal profile page all
                // posts belong to that user, so there is no author node and the user's own names are used.
                if (!user.NickName.Equals(""))
                {
                    feed.Author = user.NickName;
                    feed.RemarkName = user.RemarkName;
                }
                else
                {
                    Console.WriteLine("第" + i + "条微博中,判断微博作者的标准出错!");
                }
            }

            // --- Repost handling ---
            NodeList reFeedNodeList = feedDiv.Children.ExtractAllNodesThatMatch(reFeedFilter, true);
            // Expected count is (1 + similarFeedCount) for the same reason as for the author nodes.
            if (reFeedNodeList.Size() == (1 + similarFeedCount))
            {
                TagNode reFeedDiv = (TagNode)reFeedNodeList[0];
                // Fill in the repost information of this post first.
                GetReFeedInfo(i, feed, reFeedDiv, feedDiv);

                // Then emit one extra feed per "similar repost".
                if (similarFeedCount > 0)
                {
                    NodeList similarFeedNodeList = feedDiv.Children.ExtractAllNodesThatMatch(similarFeedFilter, true);
                    if (similarFeedNodeList.Size() == similarFeedCount)
                    {
                        for (int j = 0; j < similarFeedCount; j++)
                        {
                            feedList.Add(GetSimilarFeed(currentPage, i, (TagNode)similarFeedNodeList[j], feed.OriginalAuthor, feed.Content));
                        }
                    }
                    else
                    {
                        Console.WriteLine("第" + i + "条微博中,获取转发微博的数量出错!");
                    }
                }
            }
            else if (reFeedNodeList.Size() == 0)
            {
                // Not a repost: this post's own text is the content.
                NodeList feedContentNodeList = feedDiv.Children.ExtractAllNodesThatMatch(feedContentFilter, true);
                if (feedContentNodeList.Size() == 1)
                {
                    feed.Content = GetContentFromChildren(feed, feedContentNodeList[0], false);

                    // When the reposted post was deleted, the repost div may not match the filter
                    // at all, so check again for a "deleted" marker.
                    NodeList deletedFeedList = feedDiv.Children.ExtractAllNodesThatMatch(refeedDeletedFilter2, true);
                    if (deletedFeedList.Size() > 0)
                    {
                        feed.OriginalAuthor = "Unknown";
                        feed.ReFeedOrNot = true;
                        feed.ReFeedReason = feed.Content;
                        feed.Content = "微博已删除";
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "条微博中,判断微博内容的标准出错!");
                }
            }
            else
            {
                Console.WriteLine("第" + i + "条微博中,判断转发微博的标准出错!");
            }

            // Remaining metadata: location, like count, repost count, comment count, time and sending device.
            feed.Location = GetLocationInfo(i, feedDiv);
            feed.LikeCount = GetFeedLikeInfo(i, feedDiv);
            feed.ReFeedCount = GetFeedForwardCount(i, feedDiv);
            feed.CommentCount = GetFeedCommentCount(i, feedDiv);
            feed.Time = GetFeedTimeInfo(i, feedDiv);
            feed.Device = GetFeedSendTypeInfo(i, feedDiv);

            feedList.Add(feed);
        }
    }
}