/// <summary> /// 从网页版微博中获取微博信息 /// </summary> /// <param name="fansList">保存爬得的粉丝数组</param> public void GetInfoFromHtml(List<Fan> fansList) { Lexer lexer = new Lexer(currentHtmlContent); Parser parser = new Parser(lexer); //获取包含每条微博的div标记列表 NodeList fansNodeList = parser.Parse(fanFilter); for (int i = 0; i < fansNodeList.Size(); i++) { Fan fan = new Fan(); //获取包含一个粉丝的<li>标记 Bullet fanBullet = (Bullet)fansNodeList[i]; #region 获取该粉丝头像 NodeList fanPortraitNodeList = fanBullet.Children.ExtractAllNodesThatMatch(portraitFilter, true); if (fanPortraitNodeList.Size() == 1) { Div fanPortraitDiv = (Div)fanPortraitNodeList[0]; NodeList imgNodeList = fanPortraitDiv.Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ImageTag)), true); if (imgNodeList.Size() == 1) { ImageTag imgNode = (ImageTag)imgNodeList[0]; if (imgNode.Attributes.ContainsKey("SRC") && imgNode.Attributes.ContainsKey("ALT")) { string imgUrl = imgNode.GetAttribute("SRC"); string imgName = imgNode.GetAttribute("ALT"); fan.Name = imgName; WebClient wc = new WebClient();//使用WebClient是因为下载用户头像不用登录cookie wc.DownloadFileAsync(new Uri(imgUrl), @"portrait\" + imgName + ".jpg"); wc.DownloadFileCompleted += wc_DownloadFileCompleted; } else { Console.WriteLine("第" + i + "个粉丝中,<img>标记缺少必要的属性!"); } } else { Console.WriteLine("第" + i + "个粉丝中,获取img标记出错!"); } } else { Console.WriteLine("第" + i + "个粉丝中,获取粉丝头像的标准出错!"); } #endregion #region 获取该粉丝的关注数/粉丝数/微博数 NodeList fanConnectNodeList = fanBullet.Children.ExtractAllNodesThatMatch(fanConnectFilter, true); if (fanConnectNodeList.Size() == 1) { NodeList ATagList = fanConnectNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)), true); if (ATagList.Size() == 3) { for (int j = 0; j < 3; j++) { ATag aTag = (ATag)ATagList[j]; switch (j) { case 0: if (aTag.Attributes.ContainsKey("HREF") && aTag.GetAttribute("HREF").Contains("follow")) { fan.FollowCount = Int32.Parse(aTag.StringText); } else { Console.WriteLine("第" + i + "个粉丝中,获取粉丝的关注数出错!"); } break; case 1: if (aTag.Attributes.ContainsKey("HREF") && aTag.GetAttribute("HREF").Contains("fans")) { fan.FansCount = Int32.Parse(aTag.StringText); } else { Console.WriteLine("第" + i + "个粉丝中,获取粉丝的粉丝数出错!"); } break; default: fan.FeedsCount = Int32.Parse(aTag.StringText); break; } } } else { Console.WriteLine("第" + i + "个粉丝中,获取粉丝关注数/粉丝数/微博数的数量出错!"); } } else { Console.WriteLine("第" + i + "个粉丝中,获取粉丝关注数/粉丝数/微博数的标准出错!"); } #endregion #region 获取该粉丝的简介信息 NodeList fanInfoNodeList = fanBullet.Children.ExtractAllNodesThatMatch(fanInfoFilter, true); if (fanInfoNodeList.Size() == 1) { //Console.WriteLine(fanInfoNodeList[0].Parent.ToHtml()); Div fanInfoDiv = (Div)fanInfoNodeList[0]; string intro = fanInfoDiv.StringText; if (intro.Substring(0, 2).Equals("简介")) { fan.Introduction = intro.Substring(3, intro.Length - 3).Replace("\n", " ").Replace("\t", " "); } } else { if (fanInfoNodeList.Size() == 0) { fan.Introduction = ""; } else { Console.WriteLine("第" + i + "个粉丝中,获取粉丝简介的标准出错!"); } } #endregion #region 获取该粉丝的UserID、地点和性别信息;校验该粉丝的用户名信息 NodeList fanLocationNodeList = fanBullet.Children.ExtractAllNodesThatMatch(fanNameFilter, true); if (fanLocationNodeList.Size() == 1) { //获取粉丝的UserID信息;校验该粉丝的用户名信息 NodeList aTagNodeList = fanLocationNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)), true); if (aTagNodeList.Size() >= 1) { ATag nameNode = (ATag)aTagNodeList[0]; if (nameNode.Attributes.ContainsKey("USERCARD") && nameNode.Attributes.ContainsKey("HREF")) { //获取粉丝的UserID信息 string uidStr = nameNode.GetAttribute("USERCARD"); if (uidStr.Substring(0, 3).Equals("id=")) { fan.UserID = uidStr.Substring(3, uidStr.Length - 3); } //获取粉丝的微博链接 string linkUrl = nameNode.GetAttribute("HREF"); fan.LinkURL = "http://www.weibo.com" + linkUrl; } else { Console.WriteLine("第" + i + "个粉丝中,包含用户id和链接的<a>标记中缺少必要的属性!"); } //校验该粉丝的用户名信息 if (!nameNode.StringText.Equals(fan.Name)) { Console.WriteLine("第" + i + "个粉丝中,用户名与用户头像文字描述不一致!"); } } //获取粉丝的性别和地点信息 NodeList locationNodeList = fanLocationNodeList[0].Children.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "addr"), true); if (locationNodeList.Size() == 1) { string locationStr = ""; for (int j = 0; j < locationNodeList[0].Children.Size(); j++) { INode node = locationNodeList[0].Children[j]; if (node.GetType().Equals(typeof(TextNode))) { TextNode tNode = (TextNode)node; locationStr += tNode.ToPlainTextString(); } if (node.GetType().Equals(typeof(TagNode))) { TagNode tNode = (TagNode)node; if (tNode.Attributes.ContainsKey("CLASS")) { if (tNode.GetAttribute("CLASS").Contains("female"))//必须先female,因为female中也含有male,如果male在前,则所有用户均符合该条件了= = { fan.Gender = "female"; } else { if (tNode.GetAttribute("CLASS").Contains("male")) { fan.Gender = "male"; } else { fan.Gender = "unknown"; Console.WriteLine("第" + i + "个粉丝性别不明!"); } } } } } fan.Location = locationStr.Trim(); } else { Console.WriteLine("第" + i + "个粉丝中,获取粉丝地点的标准出错!"); } } else { Console.WriteLine("第" + i + "个粉丝中,获取该粉丝的UserID、地点和性别信息的标准出错!"); } #endregion #region 获取该粉丝关注用户的方式 NodeList followMethodNodeList = fanBullet.Children.ExtractAllNodesThatMatch(followMethodFilter, true); if (followMethodNodeList.Size() == 1) { NodeList methodNodeList = followMethodNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag))); if (methodNodeList.Size() == 1) { ATag methodNode = (ATag)methodNodeList[0]; fan.FollowMethod = methodNode.StringText.Trim(); } else { Console.WriteLine("第" + i + "个粉丝中,获取该粉丝关注用户的方式的数量出错!"); } } else { Console.WriteLine("第" + i + "个粉丝中,获取该粉丝关注用户的方式的标准出错!"); } #endregion fansList.Add(fan); } }
/// <summary> /// 从网页版微博中获取微博信息 /// </summary> /// <param name="fansList">保存爬得的粉丝数组</param> public void GetInfoFromHtml(List <Fan> fansList) { Lexer lexer = new Lexer(currentHtmlContent); Parser parser = new Parser(lexer); //获取包含每条微博的div标记列表 NodeList fansNodeList = parser.Parse(fanFilter); for (int i = 0; i < fansNodeList.Size(); i++) { Fan fan = new Fan(); //获取包含一个粉丝的<li>标记 Bullet fanBullet = (Bullet)fansNodeList[i]; #region 获取该粉丝头像 NodeList fanPortraitNodeList = fanBullet.Children.ExtractAllNodesThatMatch(portraitFilter, true); if (fanPortraitNodeList.Size() == 1) { Div fanPortraitDiv = (Div)fanPortraitNodeList[0]; NodeList imgNodeList = fanPortraitDiv.Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ImageTag)), true); if (imgNodeList.Size() == 1) { ImageTag imgNode = (ImageTag)imgNodeList[0]; if (imgNode.Attributes.ContainsKey("SRC") && imgNode.Attributes.ContainsKey("ALT")) { string imgUrl = imgNode.GetAttribute("SRC"); string imgName = imgNode.GetAttribute("ALT"); fan.Name = imgName; WebClient wc = new WebClient();//使用WebClient是因为下载用户头像不用登录cookie wc.DownloadFileAsync(new Uri(imgUrl), @"portrait\" + imgName + ".jpg"); wc.DownloadFileCompleted += wc_DownloadFileCompleted; } else { Console.WriteLine("第" + i + "个粉丝中,<img>标记缺少必要的属性!"); } } else { Console.WriteLine("第" + i + "个粉丝中,获取img标记出错!"); } } else { Console.WriteLine("第" + i + "个粉丝中,获取粉丝头像的标准出错!"); } #endregion #region 获取该粉丝的关注数/粉丝数/微博数 NodeList fanConnectNodeList = fanBullet.Children.ExtractAllNodesThatMatch(fanConnectFilter, true); if (fanConnectNodeList.Size() == 1) { NodeList ATagList = fanConnectNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)), true); if (ATagList.Size() == 3) { for (int j = 0; j < 3; j++) { ATag aTag = (ATag)ATagList[j]; switch (j) { case 0: if (aTag.Attributes.ContainsKey("HREF") && aTag.GetAttribute("HREF").Contains("follow")) { fan.FollowCount = Int32.Parse(aTag.StringText); } else { Console.WriteLine("第" + i + "个粉丝中,获取粉丝的关注数出错!"); } break; case 1: if (aTag.Attributes.ContainsKey("HREF") && aTag.GetAttribute("HREF").Contains("fans")) { fan.FansCount = Int32.Parse(aTag.StringText); } else { Console.WriteLine("第" + i + "个粉丝中,获取粉丝的粉丝数出错!"); } break; default: fan.FeedsCount = Int32.Parse(aTag.StringText); break; } } } else { Console.WriteLine("第" + i + "个粉丝中,获取粉丝关注数/粉丝数/微博数的数量出错!"); } } else { Console.WriteLine("第" + i + "个粉丝中,获取粉丝关注数/粉丝数/微博数的标准出错!"); } #endregion #region 获取该粉丝的简介信息 NodeList fanInfoNodeList = fanBullet.Children.ExtractAllNodesThatMatch(fanInfoFilter, true); if (fanInfoNodeList.Size() == 1) { //Console.WriteLine(fanInfoNodeList[0].Parent.ToHtml()); Div fanInfoDiv = (Div)fanInfoNodeList[0]; string intro = fanInfoDiv.StringText; if (intro.Substring(0, 2).Equals("简介")) { fan.Introduction = intro.Substring(3, intro.Length - 3).Replace("\n", " ").Replace("\t", " "); } } else { if (fanInfoNodeList.Size() == 0) { fan.Introduction = ""; } else { Console.WriteLine("第" + i + "个粉丝中,获取粉丝简介的标准出错!"); } } #endregion #region 获取该粉丝的UserID、地点和性别信息;校验该粉丝的用户名信息 NodeList fanLocationNodeList = fanBullet.Children.ExtractAllNodesThatMatch(fanNameFilter, true); if (fanLocationNodeList.Size() == 1) { //获取粉丝的UserID信息;校验该粉丝的用户名信息 NodeList aTagNodeList = fanLocationNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)), true); if (aTagNodeList.Size() >= 1) { ATag nameNode = (ATag)aTagNodeList[0]; if (nameNode.Attributes.ContainsKey("USERCARD") && nameNode.Attributes.ContainsKey("HREF")) { //获取粉丝的UserID信息 string uidStr = nameNode.GetAttribute("USERCARD"); if (uidStr.Substring(0, 3).Equals("id=")) { fan.UserID = uidStr.Substring(3, uidStr.Length - 3); } //获取粉丝的微博链接 string linkUrl = nameNode.GetAttribute("HREF"); fan.LinkURL = "http://www.weibo.com" + linkUrl; } else { Console.WriteLine("第" + i + "个粉丝中,包含用户id和链接的<a>标记中缺少必要的属性!"); } //校验该粉丝的用户名信息 if (!nameNode.StringText.Equals(fan.Name)) { Console.WriteLine("第" + i + "个粉丝中,用户名与用户头像文字描述不一致!"); } } //获取粉丝的性别和地点信息 NodeList locationNodeList = fanLocationNodeList[0].Children.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "addr"), true); if (locationNodeList.Size() == 1) { string locationStr = ""; for (int j = 0; j < locationNodeList[0].Children.Size(); j++) { INode node = locationNodeList[0].Children[j]; if (node.GetType().Equals(typeof(TextNode))) { TextNode tNode = (TextNode)node; locationStr += tNode.ToPlainTextString(); } if (node.GetType().Equals(typeof(TagNode))) { TagNode tNode = (TagNode)node; if (tNode.Attributes.ContainsKey("CLASS")) { if (tNode.GetAttribute("CLASS").Contains("female"))//必须先female,因为female中也含有male,如果male在前,则所有用户均符合该条件了= = { fan.Gender = "female"; } else { if (tNode.GetAttribute("CLASS").Contains("male")) { fan.Gender = "male"; } else { fan.Gender = "unknown"; Console.WriteLine("第" + i + "个粉丝性别不明!"); } } } } } fan.Location = locationStr.Trim(); } else { Console.WriteLine("第" + i + "个粉丝中,获取粉丝地点的标准出错!"); } } else { Console.WriteLine("第" + i + "个粉丝中,获取该粉丝的UserID、地点和性别信息的标准出错!"); } #endregion #region 获取该粉丝关注用户的方式 NodeList followMethodNodeList = fanBullet.Children.ExtractAllNodesThatMatch(followMethodFilter, true); if (followMethodNodeList.Size() == 1) { NodeList methodNodeList = followMethodNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag))); if (methodNodeList.Size() == 1) { ATag methodNode = (ATag)methodNodeList[0]; fan.FollowMethod = methodNode.StringText.Trim(); } else { Console.WriteLine("第" + i + "个粉丝中,获取该粉丝关注用户的方式的数量出错!"); } } else { Console.WriteLine("第" + i + "个粉丝中,获取该粉丝关注用户的方式的标准出错!"); } #endregion fansList.Add(fan); } }