/// <summary>
/// Entry point that starts crawling a web page.
/// Reads a JSON request body, URL-decodes its <c>url</c> member and walks
/// <c>webRule_list</c> in order; for every rule whose <c>UrlWeb</c> is a prefix of
/// the URL, the crawler selected by the rule's <c>Response</c> / <c>WebName</c> is run.
/// </summary>
/// <returns>
/// A JSON result carrying <c>ExecuteResult</c> and a success marker, or <c>null</c>
/// when the "alarm" feed could not be fetched. When no rule produces a result the
/// "not contained in the rule table" response is returned.
/// </returns>
public ActionResult StartCatch()
{
    // The reader is deliberately not disposed: disposing it would also close
    // Request.InputStream, which this action does not own.
    var readerInput = new StreamReader(Request.InputStream);
    var javaser = new JavaScriptSerializer();
    var data = javaser.Deserialize<webUrl>(readerInput.ReadToEnd());

    // Decode the submitted URL as UTF-8. An empty body deserializes to null; treat
    // that like a URL that matches no rule instead of throwing.
    string url = data == null ? null : HttpUtility.UrlDecode(data.url, Encoding.UTF8);

    if (url != null)
    {
        foreach (var item in webRule_list)
        {
            // Only rules from the database whose site prefix matches the URL apply.
            if (!url.StartsWith(item.UrlWeb))
            {
                continue;
            }

            if (item.Response == "json")
            {
                // National early-warning information site.
                if (item.WebName == "alarm")
                {
                    return CatchAlarm(item.RequestUrl);
                }

                if (item.WebName == "damai")
                {
                    string datastr = "";
                    if (url.Contains("演唱会"))
                    {
                        datastr = "ctl=演唱会&currPage=";
                    }
                    if (url.Contains("体育比赛"))
                    {
                        datastr = "ctl=体育比赛&currPage=";
                    }

                    ActionResult damaiResult = CatchConcertPages(item.RequestUrl, "POST", datastr, item.WebName);
                    if (damaiResult != null)
                    {
                        return damaiResult;
                    }
                }

                if (item.WebName == "yongle")
                {
                    // The submitted URL itself is the request URL here. It is passed
                    // directly instead of being written back into the shared rule object.
                    ActionResult yongleResult = CatchConcertPages(url, "GET", "j=1&p=", item.WebName);
                    if (yongleResult != null)
                    {
                        return yongleResult;
                    }
                }
            }

            if (item.Response == "html")
            {
                // "E" exhibition site.
                if (item.WebName == "E")
                {
                    ResetLinkState();
                    Encoding pageEncoding = ResolvePageEncoding(url);

                    // The first page is fetched up front to read the page count.
                    string firstPage = Static.GetHtml(item.RequestUrl, pageEncoding, "POST", EPostPrefix + 1);
                    if (firstPage == null)
                    {
                        // NOTE(review): this response uses a lower-case "success" key while
                        // every other response uses "Success" - confirm what the client reads.
                        return Json(new { ExecuteResult = "fail", success = "fail" });
                    }

                    int page = Static.GetPageCount(firstPage, item.PageXpath);
                    return CatchEPages(item.RequestUrl, item.UrlWeb, item.WebContentXpath, item.AttrContentXpath, pageEncoding, page);
                }

                // China exhibition portal.
                if (item.WebName == "cnena")
                {
                    return CatchCnena(url, item.RequestUrl, item.UrlWeb, item.WebContentXpath, item.AttrContentXpath);
                }

                // China weather alarm site.
                if (item.WebName == "tianqialarm")
                {
                    return CatchTianqiAlarm(url, item.RequestUrl, item.WebContentXpath, item.AttrContentXpath);
                }
            }
        }
    }

    return Json(new { ExecuteResult = "该链接未包含在规则表中", Success = "success" });
}

// Date pattern shared by the "cnena" and "tianqialarm" crawlers.
private const string DatePattern = @"(\d{4}|\d{2})(\-|\/|\\)\d{1,2}(\-|\/|\\)\d{1,2}(\s?\d{2}:\d{2})?";

// POST body prefix for the "E" site listing; the page number is appended.
private const string EPostPrefix = "serarchwhere={1:1}&SearType=1&page=";

// Starts a crawl with an empty wait queue and an empty list of visited links.
private void ResetLinkState()
{
    WaitCatchLink = new Queue<string>();
    CatchedLink = new List<string>();
}

// Returns the encoding reported for the URL, falling back to UTF-8 when none is reported.
private static Encoding ResolvePageEncoding(string url)
{
    string encodingName = Static.GetEncoding(url);
    return Encoding.GetEncoding(encodingName ?? "UTF-8");
}

// Maps the application's virtual root to its physical path.
private string ApplicationRootPath()
{
    return Server.MapPath(System.Web.HttpContext.Current.Request.ApplicationPath.ToString());
}

// Queues a link unless it is already waiting or has already been crawled.
private void EnqueueIfNew(string link)
{
    if (!WaitCatchLink.Contains(link) && !CatchedLink.Contains(link))
    {
        WaitCatchLink.Enqueue(link);
    }
}

// Adds the analysed attributes to the database and saves them.
private void SaveBasicAttrs(List<BasicAttr> attrs)
{
    db.BasicAttr.AddRange(attrs);
    db.SaveChanges();
}

// Reads the GBK-encoded city list and splits it on spaces and line breaks.
private static List<string> LoadCityList(string cityFilePath)
{
    string citystr;
    using (FileStream file = new FileStream(cityFilePath, FileMode.Open, FileAccess.Read))
    using (StreamReader cityreader = new StreamReader(file, Encoding.GetEncoding("GBK")))
    {
        citystr = cityreader.ReadToEnd();
    }

    return citystr.Replace("\r", " ")
                  .Replace("\n", " ")
                  .Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)
                  .ToList();
}

// Drains the wait queue: downloads each link, extracts the content node and analyses its text.
private List<BasicAttr> AnalyzeQueuedLinks(Encoding pageEncoding, string attrContentXpath)
{
    List<BasicAttr> listbasic = new List<BasicAttr>();
    string serverpath = ApplicationRootPath();

    while (WaitCatchLink.Count > 0)
    {
        // Take the first waiting link and record it as crawled.
        string catchlink = WaitCatchLink.Dequeue();
        CatchedLink.Add(catchlink);

        // Skip links whose source could not be downloaded.
        string catchhtml = Static.GetHtml(catchlink, pageEncoding, null, null);
        if (catchhtml == null)
        {
            continue;
        }

        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(catchhtml);

        // Locate the content node through the rule's XPath.
        HtmlNode contentnode = doc.DocumentNode.SelectSingleNode(attrContentXpath);
        if (contentnode == null)
        {
            continue;
        }

        AnalysisHelper analy = new AnalysisHelper();
        listbasic.Add(analy.Analysis(contentnode.InnerText, serverpath));
    }

    return listbasic;
}

// Fetches the "alarm" JSON feed, tags each alert with the first known city found in its headline and saves them.
private ActionResult CatchAlarm(string requestUrl)
{
    JObject jobect = Static.GetJson(requestUrl, "GET", null);
    if (jobect == null)
    {
        return null;
    }

    // Path.Combine copes with a mapped path that has no trailing separator.
    List<string> citylist = LoadCityList(Path.Combine(ApplicationRootPath(), @"Content\ThesaurusURL\city.txt"));
    List<BasicAttr> listbasic = new List<BasicAttr>();

    var alerts = jobect["alertData"];
    if (alerts != null)
    {
        foreach (var alarmitem in alerts)
        {
            BasicAttr basic = new BasicAttr();
            basic.AttrName = alarmitem["headline"].ToString();
            basic.Start_Time = basic.End_Time = alarmitem["sendTime"].ToString();

            foreach (var city in citylist)
            {
                if (!basic.AttrName.Contains(city))
                {
                    continue;
                }

                basic.Holding_City = city;
                // Names containing a city/prefecture/district/county suffix are flagged
                // as city-level; any other match is flagged as province-level.
                if (city.Contains("市") || city.Contains("州") || city.Contains("区") || city.Contains("县"))
                {
                    basic.Is_Influence_City = true;
                }
                else
                {
                    basic.Is_Influence_Province = true;
                }
                break;
            }

            listbasic.Add(basic);
        }
    }

    SaveBasicAttrs(listbasic);

    // NOTE(review): Success is "fail" although the data was saved - kept as in the
    // original; confirm whether the client relies on it.
    return Json(new { ExecuteResult = listbasic.Count, Success = "fail" });
}

// Pages through a concert JSON API (pages 1..99). Returns a result when the API fails
// or reports no more data, and null when all pages were consumed.
private ActionResult CatchConcertPages(string requestUrl, string method, string queryPrefix, string webName)
{
    CatchHelper cathelp = new CatchHelper();

    for (int i = 1; i < 100; i++)
    {
        // Fetch the JSON object returned for this page.
        JObject jobject = Static.GetJson(requestUrl, method, queryPrefix + i);
        if (jobject == null)
        {
            return Json(new { ExecuteResult = "基础连接已经关闭", Success = "fail" });
        }

        // Hand the JSON over to be stored; a null list means everything has been crawled.
        List<Concert> listconvert = cathelp.FConcertJsonToList(jobject, webName);
        if (listconvert == null)
        {
            return Json(new { ExecuteResult = "success", Success = "success" });
        }
    }

    return null;
}

// Crawls the "E" exhibition listing pages, following every same-site link found in the content area.
private ActionResult CatchEPages(string requestUrl, string urlWeb, string webContentXpath, string attrContentXpath, Encoding pageEncoding, int pageCount)
{
    string siteRoot = urlWeb.TrimEnd('/');

    // NOTE(review): "i < pageCount" skips the last page if GetPageCount returns the
    // number of pages - kept as in the original; confirm against GetPageCount.
    for (int i = 1; i < pageCount; i++)
    {
        string html = Static.GetHtml(requestUrl, pageEncoding, "POST", EPostPrefix + i);
        if (html == null)
        {
            continue;
        }

        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);

        // Collect all links below the rule's content XPath; both lookups may find nothing.
        HtmlNode container = doc.DocumentNode.SelectSingleNode(webContentXpath);
        HtmlNodeCollection nodecollection = container == null ? null : container.SelectNodes(".//a[@href]");
        if (nodecollection == null)
        {
            continue;
        }

        foreach (HtmlNode nodelink in nodecollection)
        {
            string newlink = nodelink.Attributes["href"].Value;

            // Ignore placeholder links.
            if (newlink == "" || newlink == "#" || newlink == "javascript")
            {
                continue;
            }

            // Complete site-relative links.
            if (newlink.StartsWith("/") && !newlink.StartsWith("//"))
            {
                newlink = siteRoot + newlink;
            }

            // Stay on the rule's site and avoid queuing a link twice.
            if (newlink.StartsWith(urlWeb))
            {
                EnqueueIfNew(newlink);
            }
        }

        SaveBasicAttrs(AnalyzeQueuedLinks(pageEncoding, attrContentXpath));
    }

    return Json(new { ExecuteResult = CatchedLink.Count, Success = "success" });
}

// Crawls the "cnena" listing for exhibitions that start within the next seven days.
private ActionResult CatchCnena(string url, string requestUrl, string urlWeb, string webContentXpath, string attrContentXpath)
{
    DateTime dt = DateTime.Now;
    ResetLinkState();
    Encoding pageEncoding = ResolvePageEncoding(url);

    // By default crawl exhibitions starting within the next seven days.
    DateTime dtseven = dt.AddDays(7);

    // Also query the following month when the window crosses a month boundary.
    // "!=" rather than ">" so that December -> January is covered as well.
    int[] monthtime = dtseven.Month != dt.Month
        ? new int[] { dt.Month, dtseven.Month }
        : new int[] { dt.Month };

    string siteRoot = urlWeb.TrimEnd('/');

    foreach (int month in monthtime)
    {
        // Tracked per month so that a past date seen in one month does not cut the
        // next month's paging short.
        bool isOverdue = false;

        for (int i = 1; i < 100; i++)
        {
            string pageUrl = requestUrl + "?daytime=" + month + "&page=" + i;
            string html = Static.GetHtml(pageUrl, pageEncoding, null, null);
            if (html == null)
            {
                continue;
            }

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(html);

            HtmlNode hn = doc.DocumentNode.SelectSingleNode(webContentXpath);
            if (hn == null)
            {
                continue;
            }

            HtmlNodeCollection tables = hn.SelectNodes("table");
            if (tables != null)
            {
                foreach (HtmlNode cnenatable in tables)
                {
                    Match math = Regex.Match(cnenatable.InnerText, DatePattern, RegexOptions.IgnoreCase);
                    DateTime parsed;
                    if (!math.Success || !DateTime.TryParse(math.Value, out parsed))
                    {
                        continue;
                    }

                    DateTime dd = parsed.Date;
                    if (dd > dt && dd < dtseven)
                    {
                        EnqueueCnenaLinks(cnenatable, siteRoot);
                    }

                    // A date in the past ends paging for this month.
                    if (dd < dt)
                    {
                        isOverdue = true;
                        break;
                    }
                }
            }

            // Persist what was analysed, as the "E" crawler does.
            SaveBasicAttrs(AnalyzeQueuedLinks(pageEncoding, attrContentXpath));

            if (isOverdue)
            {
                break;
            }
        }
    }

    return Json(new { ExecuteResult = CatchedLink.Count, Success = "success" });
}

// Queues the link found in the second cell of each row of a "cnena" table.
private void EnqueueCnenaLinks(HtmlNode cnenatable, string siteRoot)
{
    HtmlNodeCollection cells = cnenatable.SelectNodes(".//td[2]");
    if (cells == null)
    {
        return;
    }

    foreach (HtmlNode cnenalink in cells)
    {
        HtmlNode anchor = cnenalink.SelectSingleNode(".//a[@href]");
        if (anchor == null)
        {
            continue;
        }

        EnqueueIfNew(siteRoot + "/" + anchor.Attributes["href"].Value);
    }
}

// Crawls the "tianqialarm" listing pages until a page contains an entry dated before today.
private ActionResult CatchTianqiAlarm(string url, string requestUrl, string webContentXpath, string attrContentXpath)
{
    ResetLinkState();
    Encoding pageEncoding = ResolvePageEncoding(url);
    string baseUrl = requestUrl.TrimEnd('/');

    // Set once a listed entry falls outside the crawl window.
    bool isOverdue = false;

    for (int i = 1; i < 100; i++)
    {
        // Build the URL of listing page i.
        string pageUrl = baseUrl + @"/" + i + @"/";
        string html = Static.GetHtml(pageUrl, pageEncoding, null, null);
        if (html == null)
        {
            continue;
        }

        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);

        HtmlNode hn = doc.DocumentNode.SelectSingleNode(webContentXpath);
        if (hn == null)
        {
            continue;
        }

        // Every parseable date in the content area, in document order.
        List<DateTime> listtime = new List<DateTime>();
        foreach (Match dateMatch in Regex.Matches(hn.InnerText, DatePattern, RegexOptions.IgnoreCase))
        {
            DateTime parsed;
            if (DateTime.TryParse(dateMatch.Value, out parsed))
            {
                listtime.Add(parsed);
            }
        }

        // Links are paired with dates by position; only pairs that exist on both
        // sides are examined, so a page with fewer dates than links cannot overrun.
        HtmlNodeCollection hc = hn.SelectNodes(".//a[@href]");
        int pairCount = hc == null ? 0 : Math.Min(hc.Count, listtime.Count);
        DateTime today = DateTime.Now.Date;

        for (int n = 0; n < pairCount; n++)
        {
            if (listtime[n].Date >= today)
            {
                EnqueueIfNew(hc[n].Attributes["href"].Value);
            }
            else
            {
                isOverdue = true;
            }
        }

        // Persist what was analysed, as the "E" crawler does.
        SaveBasicAttrs(AnalyzeQueuedLinks(pageEncoding, attrContentXpath));

        if (isOverdue)
        {
            break;
        }
    }

    return Json(new { ExecuteResult = CatchedLink.Count, Success = "success" });
}