/// <summary>
/// Adds a has-child filter to <paramref name="container"/> when <paramref name="query"/>
/// is an <c>IChildQuery</c> that carries a child query.
/// </summary>
/// <typeparam name="T">Type argument forwarded to <c>_queryBuilder.BuildQuery</c>.</typeparam>
/// <param name="query">Candidate query; ignored unless it is an <c>IChildQuery</c> with a non-null <c>ChildQuery</c>.</param>
/// <param name="options">Options passed through unchanged to <c>_queryBuilder.BuildQuery</c>.</param>
/// <param name="container">Filter container that the new filter is combined into with <c>&amp;=</c>.</param>
public override void BuildFilter<T>(object query, object options, ref FilterContainer container)
        {
            var parent = query as IChildQuery;

            // Nothing to contribute when the query is of another kind or has no child part.
            if (parent == null || parent.ChildQuery == null)
            {
                return;
            }

            var hasChild = new HasChildFilter
            {
                Query = _queryBuilder.BuildQuery<T>(parent.ChildQuery, options),
                Type = parent.ChildQuery.Type
            };

            container &= hasChild;
        }
예제 #2
0
        /// <summary>
        /// Extracts Weibo user details and feeds (posts) from a mobile-version Weibo page.
        /// </summary>
        /// <remarks>
        /// The HTML is read from the <c>htmlContent</c> field. When <c>user.NickName</c> is null or
        /// empty, the nickname, remark name and self-introduction are parsed first and stored on
        /// <c>user</c>. Every node with <c>class="c"</c> that also has an <c>ID</c> attribute is then
        /// converted into a <c>Feed</c> and appended to <paramref name="feedList"/>.
        /// Unexpected page structure is reported through <c>Console.WriteLine</c>.
        /// </remarks>
        /// <param name="index">Sequence number of the page being parsed; copied to each feed's <c>Page</c>.</param>
        /// <param name="feedList">List to which every parsed <c>Feed</c> is appended.</param>
        public void GetInfoFromHtml(int index, List<Feed> feedList)
        {
            Lexer  lexer  = new Lexer(htmlContent);
            Parser parser = new Parser(lexer);
            // On a personal home page of the mobile site, the div holding the user's name and info has class="u".
            HasAttributeFilter userFilter = new HasAttributeFilter("class", "u");
            // On the mobile site, the div of every feed carries class="c".
            HasAttributeFilter feedFilter = new HasAttributeFilter("class", "c");
            // On the mobile site, the first child div of a retweeted feed contains a span with class="cmt".
            HasAttributeFilter refeedFilter = new HasAttributeFilter("class", "cmt");
            // On the mobile site, the feed text is stored in a span with class="ctt".
            HasAttributeFilter feedContentFilter = new HasAttributeFilter("class", "ctt");
            // On the mobile site, the send time and sending device are stored in a span with class="ct".
            HasAttributeFilter feedTimeFilter = new HasAttributeFilter("class", "ct");
            // Selects the div that holds the retweet reason. The inner HasChildFilter only matches the span
            // containing the reason label text, so a second HasChildFilter is wrapped around it to reach
            // the div that contains that span.
            HasChildFilter reFeedReasonFilter = new HasChildFilter(new HasChildFilter(new StringFilter("转发理由:")));

            // A missing nickname means this home page is being crawled for the first time,
            // so the user information has to be collected. IsNullOrEmpty also covers a null nickname.
            if (string.IsNullOrEmpty(user.NickName))
            {
                #region When crawling a personal home page, collect the user information first
                NodeList userNodeList = parser.Parse(userFilter);
                if (userNodeList.Size() == 1)
                {
                    // feedContentFilter is merely reused here because the wanted nodes happen to match it.
                    NodeList userDetailNodeList = userNodeList[0].Children.ExtractAllNodesThatMatch(feedContentFilter, true);
                    if (userDetailNodeList.Size() >= 2)
                    {
                        // Read the Weibo user name.
                        if (userDetailNodeList[0].Children[0].GetType().Equals(typeof(TextNode)))
                        {
                            string nickName = ((TextNode)userDetailNodeList[0].Children[0]).ToPlainTextString();
                            // Try to split off the remark name given in parentheses.
                            if (nickName.Contains("("))
                            {
                                int start = nickName.IndexOf('(');
                                int end   = nickName.IndexOf(')');
                                if (end > start)
                                {
                                    string remarkName = nickName.Substring(start + 1, end - start - 1);
                                    user.RemarkName = remarkName;
                                }
                                user.NickName = nickName.Substring(0, start);
                            }
                            else
                            {
                                user.NickName = nickName;
                            }
                        }
                        else
                        {
                            Console.WriteLine("获取微博用户名出错!");
                        }
                        // Read the self-introduction.
                        user.SelfIntroduction = ((Span)userDetailNodeList[1]).StringText;
                    }
                    else
                    {
                        Console.WriteLine("获取包含微博用户名和自我描述的div出错!");
                    }
                }
                else
                {
                    Console.WriteLine("获取包含微博用户信息的div出错!");
                }
                // Note: Reset must be called before the parser is used again.
                parser.Reset();
                #endregion
            }
            NodeList feedNodeList = parser.Parse(feedFilter);
            int      count        = 0;
            for (int i = 0; i < feedNodeList.Size(); i++)
            {
                // Holder for this feed.
                Feed feed = new Feed();
                feed.Page   = index;
                feed.Number = i + 1;
                // Running number of examined feed nodes, used in diagnostic messages.
                count++;

                // Take the div of the i-th feed and treat it as a TagNode so its attributes can be read.
                TagNode feedNode = (TagNode)feedNodeList[i];
                // Note: attribute names used as keys must be upper case, e.g. "ID".
                // A node without an ID attribute is not a feed and is skipped.
                if (feedNode.Attributes.Contains("ID"))
                {
                    // Layout observed on the mobile site:
                    // the first child div usually holds the feed text;
                    // the second child div holds pictures, send time and so on;
                    // a retweeted feed has a third child div with the retweet reason, source and time.

                    // First child div.
                    TagNode feedFirstDiv = (TagNode)feedNode.Children[0];
                    // Look for the markers of a retweeted feed.
                    NodeList reFeedList = feedFirstDiv.Children.ExtractAllNodesThatMatch(refeedFilter, true);
                    // In practice class="cmt" is used by more than just retweets, hence the extra text check below.
                    if (reFeedList.Size() > 0)
                    {
                        string reFeedMarker = HttpUtility.HtmlDecode(((TextNode)reFeedList[0].Children[0]).ToPlainTextString());
                        // StartsWith (ordinal) instead of Substring(0, 2): text shorter than two characters
                        // simply fails the check rather than throwing ArgumentOutOfRangeException.
                        if (reFeedMarker.StartsWith("转发", StringComparison.Ordinal))
                        {
                            feed.ReFeedOrNot    = true;
                            feed.OriginalAuthor = HttpUtility.HtmlDecode(((ATag)reFeedList[0].Children[1]).StringText);
                            // Find the child div that contains the retweet reason.
                            NodeList reFeedReasonList = feedNode.Children.ExtractAllNodesThatMatch(reFeedReasonFilter, true);
                            if (reFeedReasonList.Size() == 1)
                            {
                                TagNode reFeedReasonDiv = (TagNode)reFeedReasonList[0];
                                // Inside that div the first child is always the span with the reason label.
                                // The following children make up the reason itself: text nodes and links (@someone).
                                // The reason ends at a text node whose last two characters are "//",
                                // or at a link whose text starts with the "like" label.
                                for (int j = 1; j < reFeedReasonDiv.Children.Size(); j++)
                                {
                                    Type t = reFeedReasonDiv.Children[j].GetType();
                                    if (t.Equals(typeof(TextNode)))
                                    {
                                        string reasonText = HttpUtility.HtmlDecode(((TextNode)reFeedReasonDiv.Children[j]).ToPlainTextString());
                                        if (reasonText.EndsWith("//", StringComparison.Ordinal))
                                        {
                                            feed.ReFeedReason += reasonText.Substring(0, reasonText.Length - 2);
                                            // The node after "//" is expected to be the link to the previous retweeter.
                                            // Guard against "//" being the last child or being followed by a non-link node.
                                            ATag previousAuthorLink = null;
                                            if (j + 1 < reFeedReasonDiv.Children.Size())
                                            {
                                                previousAuthorLink = reFeedReasonDiv.Children[j + 1] as ATag;
                                            }
                                            if (previousAuthorLink != null)
                                            {
                                                string reFeedFrom = HttpUtility.HtmlDecode(previousAuthorLink.StringText);
                                                // Drop the leading '@' of the previous retweeter's name.
                                                if (reFeedFrom != null && reFeedFrom.StartsWith("@", StringComparison.Ordinal))
                                                {
                                                    reFeedFrom = reFeedFrom.Substring(1);
                                                }
                                                feed.ReFeedFrom = reFeedFrom;
                                            }
                                            else
                                            {
                                                Console.WriteLine("糟糕!第" + count + "条微博中,找不到上一个转发者!");
                                            }
                                            break;
                                        }
                                        else
                                        {
                                            feed.ReFeedReason += reasonText;
                                        }
                                        continue;
                                    }
                                    if (t.Equals(typeof(ATag)))
                                    {
                                        string linkText = HttpUtility.HtmlDecode(((ATag)reFeedReasonDiv.Children[j]).StringText);
                                        // StartsWith tolerates an empty link text, unlike Substring(0, 1).
                                        if (linkText.StartsWith("赞", StringComparison.Ordinal))
                                        {
                                            // Reached the "like" link without a "//" separator:
                                            // the feed was retweeted directly from the original author.
                                            feed.ReFeedFrom = feed.OriginalAuthor;
                                            break;
                                        }
                                        else
                                        {
                                            feed.ReFeedReason += linkText;
                                        }
                                        continue;
                                    }
                                }
                            }
                            else
                            {
                                Console.WriteLine("好像找到不止一个转发理由?!");
                            }
                        }
                        else
                        {
                            Console.WriteLine("糟糕!第" + count + "条微博中,找不到转发微博的来源!");
                        }
                    }
                    // Find the marker that contains the feed text.
                    NodeList feedContentList = feedFirstDiv.Children.ExtractAllNodesThatMatch(feedContentFilter, true);
                    switch (feedContentList.Size())
                    {
                    case 1:
                        // The feed text lives inside a single span.
                        Span feedContentListNode = (Span)feedContentList[0];
                        // The text is an arbitrary mix of text nodes and links (e.g. @someone),
                        // so each child of the span is handled according to its node type.
                        for (int j = 0; j < feedContentListNode.Children.Size(); j++)
                        {
                            Type t = feedContentListNode.Children[j].GetType();
                            if (t.Equals(typeof(TextNode)))
                            {
                                feed.Content += HttpUtility.HtmlDecode(((TextNode)feedContentListNode.Children[j]).ToPlainTextString());
                                continue;
                            }
                            if (t.Equals(typeof(ATag)))
                            {
                                feed.Content += HttpUtility.HtmlDecode(((ATag)feedContentListNode.Children[j]).StringText);
                                continue;
                            }
                        }
                        break;

                    default:
                        Console.WriteLine("糟糕!第" + count + "条微博中,取得微博正文的判断标准出错了!");
                        break;
                    }

                    // Search the whole feed for the marker that contains the send time.
                    NodeList feedTimeList = feedNode.Children.ExtractAllNodesThatMatch(feedTimeFilter, true);
                    switch (feedTimeList.Size())
                    {
                    case 1:
                        string time = HttpUtility.HtmlDecode(((TextNode)((Span)feedTimeList[0]).Children[0]).ToHtml());
                        feed.Time = Program.GetTime(time);
                        if (feedTimeList[0].Children.Size() > 1)
                        {
                            feed.Device = HttpUtility.HtmlDecode(((ATag)((Span)feedTimeList[0]).Children[1]).StringText);
                        }
                        // Walking backwards from the time marker reaches the "like", "retweet" and "comment" links.
                        // NOTE(review): the Substring offsets below assume link texts shaped like a label
                        // followed by a bracketed number — confirm against the live page.
                        INode node = feedTimeList[0];
                        for (int j = 0; j < 9; j++)
                        {
                            node = node.PreviousSibling;
                            if (node == null)
                            {
                                // Fewer preceding siblings than expected; stop instead of dereferencing null.
                                Console.WriteLine("糟糕!第" + count + "条微博中,找不到赞、转发或评论的标记!");
                                break;
                            }
                            switch (j)
                            {
                            case 4:
                                // Comment count: skip 3 leading characters and 1 trailing character.
                                string strCommentCount = ((ATag)node).StringText;
                                feed.CommentCount = Int32.Parse(strCommentCount.Substring(3, strCommentCount.Length - 4));
                                break;

                            case 6:
                                // Retweet count: skip 3 leading characters and 1 trailing character.
                                string strReFeedCount = ((ATag)node).StringText;
                                feed.ReFeedCount = Int32.Parse(strReFeedCount.Substring(3, strReFeedCount.Length - 4));
                                break;

                            case 8:
                                // Like count: skip 2 leading characters and 1 trailing character.
                                string strLikeCount = ((ATag)node).StringText;
                                feed.LikeCount = Int32.Parse(strLikeCount.Substring(2, strLikeCount.Length - 3));
                                break;

                            default:
                                break;
                            }
                        }
                        break;

                    default:
                        Console.WriteLine("糟糕!第" + count + "条微博中,取得微博时间的判断标准出错了!");
                        break;
                    }
                    feedList.Add(feed);
                }
            }
        }