예제 #1
0
        /// <summary>
        /// Builds the <see cref="Feed"/> for one "similar repost" entry found under a feed item.
        /// </summary>
        /// <param name="currentPage">Page number recorded on the new feed.</param>
        /// <param name="i">Index of the enclosing feed item within its page; used in diagnostic messages and passed on to the metadata helpers.</param>
        /// <param name="similarFeedDiv">The div tag that contains the similar repost.</param>
        /// <param name="originalAuthor">Author of the original post that was reposted.</param>
        /// <param name="feedContent">Content of the original post that was reposted.</param>
        /// <returns>The populated feed instance.</returns>
        private Feed GetSimilarFeed(int currentPage, int i, TagNode similarFeedDiv, string originalAuthor, string feedContent)
        {
            Feed feed = new Feed();
            // Each similar repost takes the next value of the running serial number
            serialNumber++;
            feed.Page = currentPage;
            feed.Number = serialNumber;

            feed.ReFeedOrNot = true;

            #region Author of the similar repost
            NodeList feedAuthorNodeList = similarFeedDiv.Children.ExtractAllNodesThatMatch(feedAuthorFilter, true);
            ATag feedAuthorTag = null;
            if (feedAuthorNodeList.Size() == 1)
            {
                // The author link is read from the second child of the matched node
                NodeList authorChildren = feedAuthorNodeList[0].Children;
                if (authorChildren != null && authorChildren.Size() > 1)
                {
                    feedAuthorTag = authorChildren[1] as ATag;
                }
            }

            if (feedAuthorTag != null)
            {
                feed.Author = feedAuthorTag.GetAttribute("TITLE");
                // A Span directly after the author link, when present, carries the remark name
                INode remarkNameNode = feedAuthorTag.NextSibling;
                if (remarkNameNode != null && remarkNameNode.GetType().Equals(typeof(Span)))
                {
                    string remarkName = ((Span)remarkNameNode).StringText;
                    // Strip the surrounding brackets; text too short to hold both brackets is kept unchanged
                    if (remarkName != null && remarkName.Length >= 2)
                    {
                        remarkName = remarkName.Substring(1, remarkName.Length - 2);
                    }
                    feed.RemarkName = remarkName;
                }
            }
            else
            {
                // Unexpected markup: report it and leave the author fields unset instead of crashing
                Console.WriteLine("第" + i + "条微博中,判断转发类似微博的作者的标准出错!");
            }
            #endregion

            #region Repost reason of the similar repost
            NodeList reFeedReasonNodeList = similarFeedDiv.Children.ExtractAllNodesThatMatch(feedContentFilter, true);
            if (reFeedReasonNodeList.Size() == 1)
            {
                feed.ReFeedReason = GetContentFromChildren(feed, reFeedReasonNodeList[0], false);
            }
            else
            {
                Console.WriteLine("第" + i + "条微博中,判断转发类似微博的转发理由的标准出错!");
            }
            #endregion

            // The original post's author and content are supplied by the caller
            feed.OriginalAuthor = originalAuthor;
            feed.Content = feedContent;

            // Remaining metadata is read from the similar repost's own div
            feed.Location = GetLocationInfo(i, similarFeedDiv);
            feed.LikeCount = GetFeedLikeInfo(i, similarFeedDiv);
            feed.ReFeedCount = GetFeedForwardCount(i, similarFeedDiv);
            feed.CommentCount = GetFeedCommentCount(i, similarFeedDiv);
            feed.Time = GetFeedTimeInfo(i, similarFeedDiv);
            feed.Device = GetFeedSendTypeInfo(i, similarFeedDiv);

            return feed;
        }
예제 #2
0
 /// <summary>
 /// Helper: concatenates the text found in the direct children of <paramref name="node"/> into one content string.
 /// </summary>
 /// <param name="feed">Feed whose <c>ReFeedFrom</c> is set when a text child ending in "//" (repost-chain marker) is found.</param>
 /// <param name="node">Root HTML node whose children are scanned.</param>
 /// <param name="hasEmTag">When true, children are skipped until an EM tag has been seen; when false, extraction starts at the first child.</param>
 /// <returns>The extracted content with leading and trailing spaces and line breaks removed; empty when the node has no children.</returns>
 private static string GetContentFromChildren(Feed feed, INode node, bool hasEmTag)
 {
     string content = "";
     NodeList children = node.Children;
     // A node without a child list has nothing to extract
     if (children == null)
     {
         return content;
     }

     // Used to pull every text node out of a link, whatever else the link contains
     NodeClassFilter textNodeFilter = new NodeClassFilter(typeof(TextNode));
     bool start = !hasEmTag;
     int childCount = children.Size();
     for (int i = 0; i < childCount; i++)
     {
         INode child = children[i];
         Type t = child.GetType();

         if (!start)
         {
             // Still before the content: only an EM tag switches extraction on
             if (t.Equals(typeof(TagNode)) && ((TagNode)child).TagName.Equals("EM"))
             {
                 start = true;
             }
             continue;
         }

         if (t.Equals(typeof(TextNode)))
         {
             string str = ((TextNode)child).ToPlainTextString();
             // A trailing "//" ends the content; the user links after it form the repost chain
             if (str.EndsWith("//", StringComparison.Ordinal))
             {
                 content += str.Substring(0, str.Length - 2);

                 string reFeedFrom = "";
                 for (int j = i + 1; j < childCount; j++)
                 {
                     INode follower = children[j];
                     if (follower.GetType().Equals(typeof(ATag)) && ((ATag)follower).Attributes.ContainsKey("USERCARD"))
                     {
                         string oneReFeeder = ((ATag)follower).StringText;
                         if (!String.IsNullOrEmpty(oneReFeeder) && oneReFeeder[0] == '@')
                         {
                             // Drop the leading '@' and prepend, so later links end up first
                             reFeedFrom = oneReFeeder.Substring(1) + " " + reFeedFrom;
                         }
                         else
                         {
                             Console.WriteLine("获取转发链时出现错误!此前的转发链为" + reFeedFrom);
                         }
                     }
                 }
                 feed.ReFeedFrom = reFeedFrom;
                 break;
             }
             content += str;
             continue;
         }

         if (t.Equals(typeof(ATag)))
         {
             // A link may wrap more than plain text (e.g. a span), so collect all text nodes beneath it
             NodeList linkChildren = ((ATag)child).Children;
             if (linkChildren != null)
             {
                 NodeList textNodes = linkChildren.ExtractAllNodesThatMatch(textNodeFilter, true);
                 for (int j = 0; j < textNodes.Size(); j++)
                 {
                     content += ((TextNode)textNodes[j]).ToPlainTextString();
                 }
             }
             continue;
         }

         if (t.Equals(typeof(TagNode)))
         {
             content += ((TagNode)child).ToPlainTextString();
             continue;
         }

         if (t.Equals(typeof(ImageTag)))
         {
             // Images contribute their TITLE attribute
             content += ((ImageTag)child).GetAttribute("TITLE");
             continue;
         }
     }

     // The extracted text sometimes starts or ends with spaces and line breaks
     char[] shouldRemove = { ' ', '\r', '\n' };
     return content.Trim(shouldRemove);
 }
예제 #3
0
        /// <summary>
        /// Fills in the repost-related fields of <paramref name="feed"/>: original author, original content and repost reason.
        /// </summary>
        /// <param name="i">Index of the feed item within its page; only used in diagnostic messages.</param>
        /// <param name="feed">Feed instance that receives the extracted values.</param>
        /// <param name="reFeedDiv">The div tag that contains the reposted post.</param>
        /// <param name="feedDiv">The div tag of the feed item itself, from which the repost reason is read.</param>
        private void GetReFeedInfo(int i, Feed feed, TagNode reFeedDiv, TagNode feedDiv)
        {
            // Mark the feed as a repost
            feed.ReFeedOrNot = true;
            // Set when the "reposted post has been deleted" marker is found
            bool reFeedIsDeleted = false;

            #region Original author
            NodeList reFeedOriginalAuthorNodeList = reFeedDiv.Children.ExtractAllNodesThatMatch(reFeedAuthorFilter, true);
            if (reFeedOriginalAuthorNodeList.Size() == 1)
            {
                // The author link's position among the children is not fixed (empty text nodes may precede it),
                // so scan for the first ATag instead of indexing
                NodeList authorCandidates = reFeedOriginalAuthorNodeList[0].Children;
                int candidateCount = (authorCandidates == null) ? 0 : authorCandidates.Size();
                for (int j = 0; j < candidateCount; j++)
                {
                    INode candidate = authorCandidates[j];
                    if (candidate.GetType().Equals(typeof(ATag)))
                    {
                        feed.OriginalAuthor = ((ATag)candidate).GetAttribute("TITLE");
                        break;
                    }
                }
            }
            else
            {
                NodeList deletedFeedList = reFeedDiv.Children.ExtractAllNodesThatMatch(refeedDeletedFilter1, true);
                if (deletedFeedList.Size() > 0)
                {
                    reFeedIsDeleted = true;
                    feed.OriginalAuthor = "Unknown";
                }
                else
                {
                    Console.WriteLine("第" + i + "条微博中,判断转发微博作者的标准出错!");
                }
            }
            #endregion

            #region Original content
            NodeList reFeedContentNodeList = reFeedDiv.Children.ExtractAllNodesThatMatch(reFeedContentFilter, true);
            if (reFeedContentNodeList.Size() == 1)
            {
                // The matched node is handed over as-is; the helper walks its children (hasEmTag is false here)
                feed.Content = GetContentFromChildren(feed, reFeedContentNodeList[0], false);
            }
            else if (reFeedIsDeleted)
            {
                feed.Content = "微博已删除";
            }
            else
            {
                Console.WriteLine("第" + i + "条微博中,判断转发微博内容的标准出错!");
            }
            #endregion

            #region This post's own content, used as the repost reason
            NodeList reFeedReasonNodeList = feedDiv.Children.ExtractAllNodesThatMatch(feedContentFilter, true);
            // When similar reposts are present the list holds more than one match; the first one is this post's.
            // An empty list is reported rather than indexed.
            if (reFeedReasonNodeList.Size() > 0)
            {
                feed.ReFeedReason = GetContentFromChildren(feed, reFeedReasonNodeList[0], false);
            }
            else
            {
                Console.WriteLine("第" + i + "条微博中,判断转发理由的标准出错!");
            }
            #endregion
        }
예제 #4
0
        /// <summary>
        /// Parses every HTML fragment in <c>htmlContentList</c> and appends one <see cref="Feed"/> per post
        /// (plus one per "similar repost") to <paramref name="feedList"/>.
        /// </summary>
        /// <param name="currentPage">Page number recorded on each extracted feed.</param>
        /// <param name="feedList">List that receives the extracted feeds.</param>
        public void GetInfoFromHtml(int currentPage, List<Feed> feedList)
        {
            foreach (string htmlContent in htmlContentList)
            {
                Lexer lexer = new Lexer(htmlContent);
                Parser parser = new Parser(lexer);
                // One div per post
                NodeList feedNodeList = parser.Parse(feedFilter);
                for (int i = 0; i < feedNodeList.Size(); i++)
                {
                    serialNumber++;
                    Feed feed = new Feed();
                    feed.Page = currentPage;
                    feed.Number = serialNumber;
                    // Number of "similar reposts" attached to this post
                    int similarFeedCount = 0;

                    // The div of the i-th post
                    TagNode feedDiv = (TagNode)feedNodeList[i];

                    // Look for the "N more reposts of the original post" marker
                    NodeList similarfeedCountNodeList = feedDiv.Children.ExtractAllNodesThatMatch(similarFeedCountFilter, true);
                    switch (similarfeedCountNodeList.Size())
                    {
                        case 1:
                            {
                                // The number is read from the text node that follows the matched node
                                // (the parser apparently does not treat the <b> tag as a container)
                                TextNode countNode = similarfeedCountNodeList[0].NextSibling as TextNode;
                                int parsedCount;
                                if (countNode != null
                                    && Int32.TryParse(countNode.ToPlainTextString(), out parsedCount)
                                    && parsedCount >= 0)
                                {
                                    similarFeedCount = parsedCount;
                                }
                                else
                                {
                                    // Malformed counter: report it and carry on as if there were no similar reposts
                                    Console.WriteLine("第" + i + "条微博中,判断是否含有类似微博转发的标准出错!");
                                }
                                break;
                            }
                        case 0:
                            // No similar reposts
                            similarFeedCount = 0;
                            break;
                        default:
                            Console.WriteLine("第" + i + "条微博中,判断是否含有类似微博转发的标准出错!");
                            break;
                    }

                    #region Post author
                    NodeList feedAuthorNodeList = feedDiv.Children.ExtractAllNodesThatMatch(feedAuthorFilter, true);
                    // Within one feedDiv the author filter should match this post's author plus one author
                    // per similar repost, hence (1 + similarFeedCount)
                    if (feedAuthorNodeList.Size() == (1 + similarFeedCount))
                    {
                        // The author link is read from the first child of the first match
                        NodeList authorChildren = feedAuthorNodeList[0].Children;
                        ATag feedAuthorTag = null;
                        if (authorChildren != null && authorChildren.Size() > 0)
                        {
                            feedAuthorTag = authorChildren[0] as ATag;
                        }

                        if (feedAuthorTag != null)
                        {
                            feed.Author = feedAuthorTag.GetAttribute("TITLE");
                            // A Span directly after the author link, when present, carries the remark name
                            INode remarkNameNode = feedAuthorTag.NextSibling;
                            if (remarkNameNode != null && remarkNameNode.GetType().Equals(typeof(Span)))
                            {
                                string remarkName = ((Span)remarkNameNode).StringText;
                                // Strip the surrounding brackets; text too short to hold both is kept unchanged
                                if (remarkName != null && remarkName.Length >= 2)
                                {
                                    remarkName = remarkName.Substring(1, remarkName.Length - 2);
                                }
                                feed.RemarkName = remarkName;
                            }
                        }
                        else
                        {
                            Console.WriteLine("第" + i + "条微博中,判断微博作者的标准出错!");
                        }
                    }
                    else
                    {
                        // No usable author node in the markup: fall back to the configured user when
                        // a nickname is available (the original note says profile pages carry no author nodes)
                        if (!String.IsNullOrEmpty(user.NickName))
                        {
                            feed.Author = user.NickName;
                            feed.RemarkName = user.RemarkName;
                        }
                        else
                        {
                            Console.WriteLine("第" + i + "条微博中,判断微博作者的标准出错!");
                        }
                    }
                    #endregion

                    #region Reposted post
                    NodeList reFeedNodeList = feedDiv.Children.ExtractAllNodesThatMatch(reFeedFilter, true);
                    // Same (1 + similarFeedCount) reasoning as for the author nodes
                    if (reFeedNodeList.Size() == (1 + similarFeedCount))
                    {
                        TagNode reFeedDiv = (TagNode)reFeedNodeList[0];
                        // First fill in this post's own repost information
                        GetReFeedInfo(i, feed, reFeedDiv, feedDiv);
                        #region Similar reposts
                        if (similarFeedCount > 0)
                        {
                            NodeList similarFeedNodeList = feedDiv.Children.ExtractAllNodesThatMatch(similarFeedFilter, true);
                            if (similarFeedNodeList.Size() == similarFeedCount)
                            {
                                for (int j = 0; j < similarFeedCount; j++)
                                {
                                    feedList.Add(GetSimilarFeed(currentPage, i, (TagNode)similarFeedNodeList[j], feed.OriginalAuthor, feed.Content));
                                }
                            }
                            else
                            {
                                Console.WriteLine("第" + i + "条微博中,获取转发微博的数量出错!");
                            }
                        }
                        #endregion
                    }
                    else if (reFeedNodeList.Size() == 0)
                    {
                        // Not a repost (as far as reFeedFilter can tell): the post's own text is its content
                        NodeList feedContentNodeList = feedDiv.Children.ExtractAllNodesThatMatch(feedContentFilter, true);
                        if (feedContentNodeList.Size() == 1)
                        {
                            feed.Content = GetContentFromChildren(feed, feedContentNodeList[0], false);
                            #region A deleted reposted post may not match reFeedFilter at all, so check for the deleted marker again
                            NodeList deletedFeedList = feedDiv.Children.ExtractAllNodesThatMatch(refeedDeletedFilter2, true);
                            if (deletedFeedList.Size() > 0)
                            {
                                // What was read as content is really the repost reason
                                feed.OriginalAuthor = "Unknown";
                                feed.ReFeedOrNot = true;
                                feed.ReFeedReason = feed.Content;
                                feed.Content = "微博已删除";
                            }
                            #endregion
                        }
                        else
                        {
                            Console.WriteLine("第" + i + "条微博中,判断微博内容的标准出错!");
                        }
                    }
                    else
                    {
                        Console.WriteLine("第" + i + "条微博中,判断转发微博的标准出错!");
                    }
                    #endregion

                    // Remaining metadata is read from the post's div
                    feed.Location = GetLocationInfo(i, feedDiv);
                    feed.LikeCount = GetFeedLikeInfo(i, feedDiv);
                    feed.ReFeedCount = GetFeedForwardCount(i, feedDiv);
                    feed.CommentCount = GetFeedCommentCount(i, feedDiv);
                    feed.Time = GetFeedTimeInfo(i, feedDiv);
                    feed.Device = GetFeedSendTypeInfo(i, feedDiv);

                    feedList.Add(feed);
                }
            }
        }