/// <summary>
/// Downloads the page for the given spider and fills the spider with the entries matched on it.
/// </summary>
/// <remarks>
/// <c>Spider.Tag</c> is set to 1 when the page could not be downloaded, and to 2 when the expected
/// page section or the regex match is missing (i.e. the page layout changed). On success the
/// matches are stored in <c>Spider.MC</c> and copied into the spider's result lists.
/// </remarks>
/// <param name="Spider">Spider describing the URL to fetch and the regex to apply.</param>
/// <returns>The same <paramref name="Spider"/> instance, updated in place.</returns>
public async Task<SchoolInfoSpider> GetSpiderByName(SchoolInfoSpider Spider)
{
    string content = await WebMsg(Spider);
    if (content == null)
    {
        // Network failure: no page was retrieved.
        Spider.Tag = 1;
        return Spider;
    }

    bool isSchool = Spider.Url == "http://bjtu.edu.cn";
    bool isScit = !isSchool && Spider.Url == "http://scit.bjtu.edu.cn";

    // Pick the part of the page the regex should run against.
    string source;
    if (isSchool)
    {
        source = ExtractSection(content, "输入您所要查找的关键词", "allowScriptAccess");
    }
    else if (!isScit)
    {
        // Ordinary school sites: the regex runs over the whole page.
        source = content;
    }
    else if (Spider.Name == "计算机本科生")
    {
        source = ExtractSection(content, "本科生通知控件", "本科生通知面板结束");
    }
    else
    {
        // Any other spider on the computer science site is treated as the graduate one.
        source = ExtractSection(content, "研究生通知控件", "研究生通知面板结束");
    }

    if (source == null)
    {
        // Section markers are missing or out of order: the page layout changed.
        Spider.Tag = 2;
        return Spider;
    }

    Spider.MC = Regex.Matches(source, Spider.RegexStr, RegexOptions.Singleline);
    if (Spider.MC.Count < 1)
    {
        // The page layout changed and the regex no longer matches.
        Spider.Tag = 2;
        return Spider;
    }

    if (isSchool)
    {
        AddResultNotFulllink(Spider);
    }
    else if (isScit)
    {
        AddResultScitLink(Spider);
    }
    else
    {
        AddResultFullLink(Spider);
    }

    return Spider;
}

/// <summary>
/// Returns the text from the first occurrence of <paramref name="startMarker"/> up to (not
/// including) the last occurrence of <paramref name="endMarker"/>, or <c>null</c> when either
/// marker is missing or the end marker lies before the start marker.
/// </summary>
private static string ExtractSection(string content, string startMarker, string endMarker)
{
    // Ordinal search: the markers are literal page fragments, not linguistic text.
    int start = content.IndexOf(startMarker, StringComparison.Ordinal);
    int end = content.LastIndexOf(endMarker, StringComparison.Ordinal);
    if (start < 0 || end < start)
    {
        return null;
    }

    return content.Substring(start, end - start);
}
/// <summary>
/// Downloads the source text of the page at <c>sis.Url</c>.
/// </summary>
/// <param name="sis">Spider whose <c>Url</c> is requested.</param>
/// <returns>
/// The page text, or <c>null</c> when any exception is thrown while fetching it.
/// </returns>
public async Task<string> WebMsg(SchoolInfoSpider sis)
{
    string pageText;
    try
    {
        using (var http = new HttpClient(new HtmlTextHandler()))
        {
            pageText = await http.GetStringAsync(sis.Url);
        }
    }
    catch (Exception)
    {
        // Every failure is reported to the caller as null.
        pageText = null;
    }

    return pageText;
}
/// <summary>
/// Copies every match in <c>sis.MC</c> into the spider's result lists; used for the computer
/// science school site.
/// </summary>
/// <remarks>
/// Each link is prefixed with <c>sis.ExtraUrl</c>, and HTML-escaped ampersands in the captured
/// link are unescaped so the stored URL is directly usable.
/// </remarks>
/// <param name="sis">Spider whose <c>MC</c> holds the matches and whose lists receive the results.</param>
private void AddResultScitLink(SchoolInfoSpider sis)
{
    foreach (Match mcItem in sis.MC)
    {
        // The link is captured from raw HTML, where '&' in a query string is written as "&amp;".
        string link = mcItem.Groups["link"].ToString().Replace("&amp;", "&");

        sis.TimeList.Add(mcItem.Groups["time"].ToString());
        sis.LinkList.Add(sis.ExtraUrl + link);
        sis.ChineseCharList.Add(mcItem.Groups["hanzi"].ToString());
    }
}
/// <summary>
/// Copies every match in <c>sis.MC</c> into the spider's result lists; used when the time and
/// the link are captured independently of each other.
/// </summary>
/// <param name="sis">Spider whose <c>MC</c> holds the matches and whose lists receive the results.</param>
private void AddResultFullLink(SchoolInfoSpider sis)
{
    foreach (Match entry in sis.MC)
    {
        GroupCollection groups = entry.Groups;

        string time = groups["time"].Value;
        sis.TimeList.Add(time);

        // The captured link is relative; prefix it with the site root.
        string link = sis.ExtraUrl + groups["link"].Value;
        sis.LinkList.Add(link);

        string title = groups["hanzi"].Value;
        sis.ChineseCharList.Add(title);
    }
}
/// <summary>
/// Copies every match in <c>sis.MC</c> into the spider's result lists; used when the time is
/// embedded inside the link itself.
/// </summary>
/// <remarks>
/// The link parts before and after the time are stored separately, and the full link is
/// rebuilt as front part + time + back part.
/// </remarks>
/// <param name="sis">Spider whose <c>MC</c> holds the matches and whose lists receive the results.</param>
private void AddResultNotFulllink(SchoolInfoSpider sis)
{
    foreach (Match entry in sis.MC)
    {
        GroupCollection groups = entry.Groups;

        string front = groups["frontlink"].Value;
        sis.FrontLinkList.Add(front);

        string time = groups["time"].Value;
        sis.TimeList.Add(time);

        string back = groups["backlink"].Value;
        sis.BackLinkList.Add(back);

        // Full link = the three pieces just appended, in order.
        sis.LinkList.Add(front + time + back);

        sis.ChineseCharList.Add(groups["hanzi"].Value);
    }
}
/// <summary>
/// Builds the spider configuration for the School of Economics and Management site.
/// </summary>
/// <returns>
/// A new spider with its URL, link prefix, regex and encoding set; <c>Name</c> is not assigned here.
/// </returns>
public SchoolInfoSpider Sem1()
{
    return new SchoolInfoSpider
    {
        Url = "http://sem.bjtu.edu.cn/boardmore_v3.0.asp?selectclass=%BD%CC%D1%A7%BF%C6&partFlag=true",
        ExtraUrl = "http://sem.bjtu.edu.cn/",
        // Named groups: link, hanzi (title), time.
        RegexStr = "(?<link>boardread.*?id=\\d{5})\".*?title=\"(?<hanzi>.*?)\".*?news_items_time\"[>](?<time>.*?)[<]/div[>]",
        Encoding = "GB2312"
    };
}
/// <summary>
/// Builds the spider configuration for graduate notices of the School of Computer Science.
/// </summary>
/// <returns>
/// A new spider with its URL, link prefix, regex and encoding set; <c>Name</c> is not assigned here.
/// </returns>
public SchoolInfoSpider Scit7()
{
    return new SchoolInfoSpider
    {
        Url = "http://scit.bjtu.edu.cn",
        ExtraUrl = "http://scit.bjtu.edu.cn/",
        // Named groups: hanzi (title), link, time (the text between square brackets).
        RegexStr = "title=\"(?<hanzi>.*?)\".*?\"(?<link>ShowNews.*?page=7).*?[[](?<time>.*?)[]]",
        Encoding = "UTF-8"
    };
}
/// <summary>
/// Builds the spider configuration for graduate notices of the School of Electronic and
/// Information Engineering.
/// </summary>
/// <returns>A new spider with its name, URL, link prefix, regex and encoding set.</returns>
public SchoolInfoSpider Eaie47()
{
    return new SchoolInfoSpider
    {
        Name = "电信研究生",
        Url = "http://eaie.bjtu.edu.cn/list.php?cid=47",
        ExtraUrl = "http://eaie.bjtu.edu.cn/",
        // Named groups: time (the text between parentheses), link, hanzi (title).
        RegexStr = "[<]span[>][(](?<time>.*?)[)][<]/span[>].*?(?<link>view.*?cid=47).*?title=\"(?<hanzi>.*?)\"[>]",
        Encoding = "utf-8"
    };
}
/// <summary>
/// Builds the spider configuration for graduate notices of the School of Traffic and
/// Transportation.
/// </summary>
/// <returns>A new spider with its name, URL, link prefix, regex and encoding set.</returns>
public SchoolInfoSpider Trans36()
{
    return new SchoolInfoSpider
    {
        Name = "运输研究生",
        Url = "http://trans.bjtu.edu.cn/list.php?cid=36",
        ExtraUrl = "http://trans.bjtu.edu.cn/",
        // Named groups: link, hanzi (title), time.
        RegexStr = "(?<link>view.*?cid=36).*?title=\"(?<hanzi>.*?)\".*?[<]span[>](?<time>.*?)[<]/span[>]",
        Encoding = "utf-8"
    };
}
/// <summary>
/// Builds the spider configuration for the university-level site.
/// </summary>
/// <returns>
/// A new spider with its name, URL, regex and encoding set; the link prefix is empty because
/// the regex captures absolute links.
/// </returns>
public SchoolInfoSpider BJTU()
{
    return new SchoolInfoSpider
    {
        Name = "学校",
        Url = "http://bjtu.edu.cn",
        ExtraUrl = "",
        // Named groups: frontlink, time (embedded in the link), backlink, hanzi (title).
        RegexStr = "(?<frontlink>http://news.bjtu.edu.cn/.*?/)(?<time>[0-9].*?)(?<backlink>/.*?html).*?vTitle=\'(?<hanzi>.{5,30})\'",
        Encoding = "gb2312"
    };
}