コード例 #1
0
        /// <summary>
        /// Parses <c>currentHtml</c> and appends one <c>POI</c> to <c>poiList</c> for every
        /// "DD" entry found under the single node matched by <c>poiListFilter</c>.
        /// </summary>
        /// <remarks>
        /// Each field (ratings, star rank, average cost, comment count, name, address/phone,
        /// tags) is extracted independently. When a field cannot be located or parsed, a
        /// message is written to the console and the field keeps the value <c>new POI()</c>
        /// gave it; the POI is still added to <c>poiList</c>.
        /// </remarks>
        /// <param name="currentPage">Page number stored in <c>POI.Page</c> for every entry parsed here.</param>
        public void GetInfoFromHtml(int currentPage)
        {
            // The phone number is taken as the fixed-width tail of the address text.
            const int PhoneLength = 8;
            char[] addressNoiseChars = { ' ', '\n', '\t' };

            Lexer    lexer       = new Lexer(currentHtml);
            Parser   parser      = new Parser(lexer);
            NodeList poiHeadList = parser.Parse(poiListFilter);

            if (poiHeadList.Count != 1)
            {
                Console.WriteLine("获取POI列表出错");
                return;
            }

            NodeList poiHeadChildren = poiHeadList[0].Children;
            if (poiHeadChildren == null)
            {
                // The list container has no children, so there is nothing to extract.
                return;
            }

            NodeList poiNodeList = poiHeadChildren.ExtractAllNodesThatMatch(poiFilter, false);
            int      numCount    = 0;
            for (int i = 0; i < poiNodeList.Count; i++)
            {
                DefinitionListBullet poiNode = (DefinitionListBullet)poiNodeList[i];
                if (!poiNode.TagName.Equals("DD"))
                {
                    continue;
                }

                // numCount counts only DD entries; i (used in log messages) is the raw node index.
                numCount++;
                POI poi = new POI();
                poi.Page   = currentPage;
                poi.Number = numCount;
                int parsedValue;

                #region Taste / environment / service ratings and star rank
                NodeList tasteNodeList       = poiNode.Children.ExtractAllNodesThatMatch(tasteFilter, true);
                NodeList environmentNodeList = poiNode.Children.ExtractAllNodesThatMatch(environmentFilter, true);
                NodeList serviceNodeList     = poiNode.Children.ExtractAllNodesThatMatch(serviceFilter, true);
                if (tasteNodeList.Count == 1 && environmentNodeList.Count == 1 && serviceNodeList.Count == 1)
                {
                    if (TryReadPoiRating((Span)tasteNodeList[0], i, "口味评分", out parsedValue))
                    {
                        poi.TasteRemark = parsedValue;
                    }
                    if (TryReadPoiRating((Span)environmentNodeList[0], i, "环境评分", out parsedValue))
                    {
                        poi.EnvironmentRemark = parsedValue;
                    }
                    Span serviceSpan = (Span)serviceNodeList[0];
                    if (TryReadPoiRating(serviceSpan, i, "服务评分", out parsedValue))
                    {
                        poi.ServiceRemark = parsedValue;
                    }

                    // The star rank lives two siblings after the parent of the service rating.
                    // Walk there step by step so a missing sibling is skipped instead of
                    // raising a NullReferenceException.
                    INode rankContainer = serviceSpan.Parent;
                    for (int hop = 0; hop < 2 && rankContainer != null; hop++)
                    {
                        rankContainer = rankContainer.NextSibling;
                    }
                    if (rankContainer != null && rankContainer.Children != null && rankContainer.Children.Count >= 1)
                    {
                        INode rankNodeCandidate = rankContainer.Children[0];
                        if (rankNodeCandidate.GetType().Equals(typeof(Span)))
                        {
                            int rank = RankFromTitle(((Span)rankNodeCandidate).GetAttribute("TITLE"));
                            if (rank > 0)
                            {
                                poi.Rank = rank;
                            }
                        }
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "条POI中,判断口味、环境和服务的标准出错!");
                }
                #endregion

                #region Average cost
                NodeList averageNodeList = poiNode.Children.ExtractAllNodesThatMatch(averageFilter, true);
                if (averageNodeList.Count == 1)
                {
                    // The cost text is the second sibling after the matched node.
                    INode costNode = averageNodeList[0].NextSibling;
                    if (costNode != null)
                    {
                        costNode = costNode.NextSibling;
                    }
                    if (costNode != null && costNode.GetType().Equals(typeof(TextNode)))
                    {
                        string cost = ((TextNode)costNode).ToPlainTextString();
                        if (TryParsePoiNumber(cost, i, "平均消费", out parsedValue))
                        {
                            poi.AverageCost = parsedValue;
                        }
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "条POI中,判断平均消费的标准出错!");
                }
                #endregion

                #region Comment count
                NodeList commentNodeList = poiNode.Children.ExtractAllNodesThatMatch(commentFilter, true);
                if (commentNodeList.Count == 1)
                {
                    INode commentNode = commentNodeList[0];
                    if (commentNode.GetType().Equals(typeof(ATag)))
                    {
                        string commentNum = ((ATag)commentNode).StringText ?? "";
                        // EndsWith is safe for text shorter than the suffix, unlike Substring.
                        if (commentNum.EndsWith("封点评", StringComparison.Ordinal))
                        {
                            commentNum = commentNum.Substring(0, commentNum.Length - 3);
                        }
                        if (TryParsePoiNumber(commentNum, i, "点评数", out parsedValue))
                        {
                            poi.CommentCount = parsedValue;
                        }
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "条POI中,判断点评数的标准出错!");
                }
                #endregion

                #region Shop name
                NodeList nameNodeList = poiNode.Children.ExtractAllNodesThatMatch(nameFilter, true);
                if (nameNodeList.Count == 1)
                {
                    INode nameNode = nameNodeList[0];
                    if (nameNode.GetType().Equals(typeof(ATag)))
                    {
                        poi.Name = ((ATag)nameNode).StringText;
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "条POI中,判断店名的标准出错!");
                }
                #endregion

                #region Address and phone
                NodeList addressNodeList = poiNode.Children.ExtractAllNodesThatMatch(addressFilter, true);
                if (addressNodeList.Count == 1)
                {
                    NodeList addressChildren  = addressNodeList[0].Children;
                    NodeList districtNodeList = addressChildren == null
                        ? null
                        : addressChildren.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)));
                    if (districtNodeList != null && districtNodeList.Count == 1)
                    {
                        ATag   districtTag = (ATag)districtNodeList[0];
                        string address     = districtTag.ToPlainTextString();
                        INode  detailNode  = districtTag.NextSibling;
                        if (detailNode != null && detailNode.GetType().Equals(typeof(TextNode)))
                        {
                            string detailAddress = ((TextNode)detailNode).ToPlainTextString().Trim();
                            if (detailAddress.Length >= PhoneLength)
                            {
                                // Last PhoneLength characters are the phone; the rest is the street address.
                                int splitAt = detailAddress.Length - PhoneLength;
                                poi.Phone = detailAddress.Substring(splitAt);
                                address  += detailAddress.Substring(0, splitAt);
                            }
                            else
                            {
                                // Too short to contain a phone number: keep it all as address text.
                                address += detailAddress;
                                Console.WriteLine("第" + i + "条POI中,地址文本过短,无法提取电话!");
                            }
                        }
                        // Removing every occurrence also covers leading/trailing ones,
                        // so no separate Trim is required.
                        foreach (char c in addressNoiseChars)
                        {
                            address = address.Replace(c.ToString(), "");
                        }
                        poi.Address = address;
                    }
                    else
                    {
                        Console.WriteLine("第" + i + "条POI中,判断含地址的<a>标记的标准出错!");
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "条POI中,判断地址的标准出错!");
                }
                #endregion

                #region Tags
                NodeList tagsNodeList = poiNode.Children.ExtractAllNodesThatMatch(tagsFilter, true);
                if (tagsNodeList.Count == 1)
                {
                    INode tagsNode = tagsNodeList[0];
                    if (tagsNode.Children != null)
                    {
                        for (int j = 0; j < tagsNode.Children.Count; j++)
                        {
                            INode node = tagsNode.Children[j];
                            if (node.GetType().Equals(typeof(ATag)))
                            {
                                poi.Tags.Add(node.ToPlainTextString());
                            }
                        }
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "条POI中,判断标签的标准出错!");
                }
                #endregion

                poiList.Add(poi);
            }
        }

        /// <summary>
        /// Reads a rating from a span. The placeholder text "-" means "no rating" and is
        /// skipped silently; any other non-integer text is logged.
        /// </summary>
        /// <returns><c>true</c> when <paramref name="value"/> holds a parsed rating.</returns>
        private static bool TryReadPoiRating(Span ratingSpan, int index, string fieldName, out int value)
        {
            string text = ratingSpan.ToPlainTextString();
            if (text != null && text.Equals("-"))
            {
                value = 0;
                return false;
            }
            return TryParsePoiNumber(text, index, fieldName, out value);
        }

        /// <summary>
        /// Parses an integer field of a POI, writing a console message (instead of throwing)
        /// when the text is not a valid integer.
        /// </summary>
        /// <returns><c>true</c> when <paramref name="value"/> holds the parsed number.</returns>
        private static bool TryParsePoiNumber(string text, int index, string fieldName, out int value)
        {
            if (Int32.TryParse(text, out value))
            {
                return true;
            }
            Console.WriteLine("第" + index + "条POI中," + fieldName + "不是有效的整数!");
            return false;
        }

        /// <summary>
        /// Maps a star-rank title to 5..1 by looking for the Chinese numerals 五, 四, 三, 二, 一
        /// (checked in that order). Returns 0 when the title is null or contains none of them.
        /// </summary>
        private static int RankFromTitle(string title)
        {
            if (title == null)
            {
                return 0;
            }
            string[] numerals = { "五", "四", "三", "二", "一" };
            for (int k = 0; k < numerals.Length; k++)
            {
                if (title.Contains(numerals[k]))
                {
                    return numerals.Length - k;
                }
            }
            return 0;
        }
コード例 #2
0
        /// <summary>
        /// Parses <c>currentHtmlContent</c> and appends one <c>Fan</c> to
        /// <paramref name="fansList"/> for every node matched by <c>fanFilter</c>.
        /// </summary>
        /// <remarks>
        /// For each fan this extracts the portrait (and starts an asynchronous download of it
        /// to the <c>portrait\</c> folder), the follow/fans/feeds counts, the introduction,
        /// the user id and profile link, gender and location, and the follow method.
        /// When a piece cannot be located or parsed, a message is written to the console and
        /// that property is left as <c>new Fan()</c> initialised it; the fan is still added.
        /// </remarks>
        /// <param name="fansList">List that receives the parsed fans.</param>
        public void GetInfoFromHtml(List <Fan> fansList)
        {
            Lexer  lexer  = new Lexer(currentHtmlContent);
            Parser parser = new Parser(lexer);
            // One matched node per fan.
            NodeList fansNodeList = parser.Parse(fanFilter);

            for (int i = 0; i < fansNodeList.Size(); i++)
            {
                Fan fan = new Fan();
                // The <li> element that wraps a single fan.
                Bullet fanBullet = (Bullet)fansNodeList[i];

                #region Portrait
                NodeList fanPortraitNodeList = fanBullet.Children.ExtractAllNodesThatMatch(portraitFilter, true);
                if (fanPortraitNodeList.Size() == 1)
                {
                    Div      fanPortraitDiv = (Div)fanPortraitNodeList[0];
                    NodeList imgNodeList    = fanPortraitDiv.Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ImageTag)), true);
                    if (imgNodeList.Size() == 1)
                    {
                        ImageTag imgNode = (ImageTag)imgNodeList[0];
                        if (imgNode.Attributes.ContainsKey("SRC") && imgNode.Attributes.ContainsKey("ALT"))
                        {
                            string imgUrl  = imgNode.GetAttribute("SRC");
                            string imgName = imgNode.GetAttribute("ALT");
                            fan.Name = imgName;

                            Uri portraitUri;
                            if (Uri.TryCreate(imgUrl, UriKind.Absolute, out portraitUri))
                            {
                                // WebClient is used because downloading a portrait needs no login cookie.
                                // NOTE(review): the client is not disposed here; presumably
                                // wc_DownloadFileCompleted should dispose its sender - verify.
                                WebClient wc = new WebClient();
                                // Subscribe BEFORE starting the download so a fast completion
                                // (or immediate failure) cannot be raised with no handler attached.
                                wc.DownloadFileCompleted += wc_DownloadFileCompleted;
                                wc.DownloadFileAsync(portraitUri, @"portrait\" + imgName + ".jpg");
                            }
                            else
                            {
                                Console.WriteLine("第" + i + "个粉丝中,头像地址不是有效的URL!");
                            }
                        }
                        else
                        {
                            Console.WriteLine("第" + i + "个粉丝中,<img>标记缺少必要的属性!");
                        }
                    }
                    else
                    {
                        Console.WriteLine("第" + i + "个粉丝中,获取img标记出错!");
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "个粉丝中,获取粉丝头像的标准出错!");
                }
                #endregion

                #region Follow count / fans count / feeds count
                NodeList fanConnectNodeList = fanBullet.Children.ExtractAllNodesThatMatch(fanConnectFilter, true);
                if (fanConnectNodeList.Size() == 1)
                {
                    NodeList ATagList = fanConnectNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)), true);
                    if (ATagList.Size() == 3)
                    {
                        // The three links are, in order: follow count, fans count, feeds count.
                        int count;
                        for (int j = 0; j < 3; j++)
                        {
                            ATag aTag = (ATag)ATagList[j];
                            switch (j)
                            {
                            case 0:
                                // TryParse keeps one non-numeric count from aborting the whole page.
                                if (aTag.Attributes.ContainsKey("HREF") && aTag.GetAttribute("HREF").Contains("follow")
                                    && Int32.TryParse(aTag.StringText, out count))
                                {
                                    fan.FollowCount = count;
                                }
                                else
                                {
                                    Console.WriteLine("第" + i + "个粉丝中,获取粉丝的关注数出错!");
                                }
                                break;

                            case 1:
                                if (aTag.Attributes.ContainsKey("HREF") && aTag.GetAttribute("HREF").Contains("fans")
                                    && Int32.TryParse(aTag.StringText, out count))
                                {
                                    fan.FansCount = count;
                                }
                                else
                                {
                                    Console.WriteLine("第" + i + "个粉丝中,获取粉丝的粉丝数出错!");
                                }
                                break;

                            default:
                                if (Int32.TryParse(aTag.StringText, out count))
                                {
                                    fan.FeedsCount = count;
                                }
                                else
                                {
                                    Console.WriteLine("第" + i + "个粉丝中,获取粉丝的微博数出错!");
                                }
                                break;
                            }
                        }
                    }
                    else
                    {
                        Console.WriteLine("第" + i + "个粉丝中,获取粉丝关注数/粉丝数/微博数的数量出错!");
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "个粉丝中,获取粉丝关注数/粉丝数/微博数的标准出错!");
                }
                #endregion

                #region Introduction
                NodeList fanInfoNodeList = fanBullet.Children.ExtractAllNodesThatMatch(fanInfoFilter, true);
                if (fanInfoNodeList.Size() == 1)
                {
                    Div    fanInfoDiv = (Div)fanInfoNodeList[0];
                    string intro      = fanInfoDiv.StringText;
                    // StartsWith is safe for text shorter than the prefix, unlike Substring(0, 2).
                    if (intro != null && intro.StartsWith("简介", StringComparison.Ordinal))
                    {
                        // Skip the two-character label plus the one separator character after it;
                        // text with nothing after the label yields an empty introduction.
                        string introBody = intro.Length > 3 ? intro.Substring(3) : "";
                        fan.Introduction = introBody.Replace("\n", " ").Replace("\t", " ");
                    }
                }
                else
                {
                    if (fanInfoNodeList.Size() == 0)
                    {
                        // No introduction block at all is a normal case, not an error.
                        fan.Introduction = "";
                    }
                    else
                    {
                        Console.WriteLine("第" + i + "个粉丝中,获取粉丝简介的标准出错!");
                    }
                }
                #endregion

                #region User id, profile link, gender and location; cross-check of the user name
                NodeList fanLocationNodeList = fanBullet.Children.ExtractAllNodesThatMatch(fanNameFilter, true);
                if (fanLocationNodeList.Size() == 1)
                {
                    // User id and profile link come from the first <a> inside the name block.
                    NodeList aTagNodeList = fanLocationNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)), true);
                    if (aTagNodeList.Size() >= 1)
                    {
                        ATag nameNode = (ATag)aTagNodeList[0];
                        if (nameNode.Attributes.ContainsKey("USERCARD") && nameNode.Attributes.ContainsKey("HREF"))
                        {
                            // USERCARD is expected to look like "id=<uid>".
                            string uidStr = nameNode.GetAttribute("USERCARD");
                            if (uidStr != null && uidStr.StartsWith("id=", StringComparison.Ordinal))
                            {
                                fan.UserID = uidStr.Substring(3);
                            }

                            // HREF is appended to the site root to form the profile URL.
                            string linkUrl = nameNode.GetAttribute("HREF");
                            fan.LinkURL = "http://www.weibo.com" + linkUrl;
                        }
                        else
                        {
                            Console.WriteLine("第" + i + "个粉丝中,包含用户id和链接的<a>标记中缺少必要的属性!");
                        }
                        // The link text should equal the name taken from the portrait's ALT text.
                        // Static string.Equals tolerates a null link text.
                        if (!string.Equals(nameNode.StringText, fan.Name))
                        {
                            Console.WriteLine("第" + i + "个粉丝中,用户名与用户头像文字描述不一致!");
                        }
                    }

                    // Gender and location come from the element with class "addr".
                    NodeList locationNodeList = fanLocationNodeList[0].Children.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "addr"), true);
                    if (locationNodeList.Size() == 1)
                    {
                        string   locationStr      = "";
                        NodeList locationChildren = locationNodeList[0].Children;
                        int      childCount       = locationChildren == null ? 0 : locationChildren.Size();
                        for (int j = 0; j < childCount; j++)
                        {
                            INode node = locationChildren[j];
                            if (node.GetType().Equals(typeof(TextNode)))
                            {
                                // Text children together form the location string.
                                locationStr += ((TextNode)node).ToPlainTextString();
                            }
                            else if (node.GetType().Equals(typeof(TagNode)))
                            {
                                TagNode genderNode = (TagNode)node;
                                if (genderNode.Attributes.ContainsKey("CLASS"))
                                {
                                    string cssClass = genderNode.GetAttribute("CLASS");
                                    // "female" must be tested first: it contains "male", so testing
                                    // "male" first would classify every user as male.
                                    if (cssClass.Contains("female"))
                                    {
                                        fan.Gender = "female";
                                    }
                                    else if (cssClass.Contains("male"))
                                    {
                                        fan.Gender = "male";
                                    }
                                    else
                                    {
                                        fan.Gender = "unknown";
                                        Console.WriteLine("第" + i + "个粉丝性别不明!");
                                    }
                                }
                            }
                        }
                        fan.Location = locationStr.Trim();
                    }
                    else
                    {
                        Console.WriteLine("第" + i + "个粉丝中,获取粉丝地点的标准出错!");
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "个粉丝中,获取该粉丝的UserID、地点和性别信息的标准出错!");
                }
                #endregion

                #region Follow method
                NodeList followMethodNodeList = fanBullet.Children.ExtractAllNodesThatMatch(followMethodFilter, true);
                if (followMethodNodeList.Size() == 1)
                {
                    NodeList methodNodeList = followMethodNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)));
                    if (methodNodeList.Size() == 1)
                    {
                        ATag methodNode = (ATag)methodNodeList[0];
                        fan.FollowMethod = methodNode.StringText.Trim();
                    }
                    else
                    {
                        Console.WriteLine("第" + i + "个粉丝中,获取该粉丝关注用户的方式的数量出错!");
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "个粉丝中,获取该粉丝关注用户的方式的标准出错!");
                }
                #endregion

                fansList.Add(fan);
            }
        }