/// <summary>
/// Link-validation service loop: on every cycle, checks the reachability of each
/// enabled app/web endpoint registered in T_HTZ_ServiceApp, then waits for the
/// configured interval before the next pass.
/// </summary>
/// <remarks>
/// Kept as <c>async void</c> for caller compatibility (fire-and-forget entry
/// point); all exceptions are handled inside the loop so none can escape.
/// </remarks>
public async static void Start()
{
    while (true)
    {
        // Default polling interval: 24 hours in milliseconds.
        var waittime = 24 * 60 * 60 * 1000;
        try
        {
            using (var db = new BizDataContext())
            {
                // Optional override of the interval from the configuration table.
                var temp = db.Set<T_Configuration>()
                    .Where(p => p.Configuration_Key == CommonHelper.LINKKEY)
                    .FirstOrDefault();
                // BUGFIX: Int32.Parse threw FormatException on a malformed config
                // value; TryParse falls back to the default interval instead.
                if (temp != null && int.TryParse(temp.Configuration_Value, out var configuredWait))
                {
                    waittime = configuredWait;
                }

                var services = await db.Set<T_HTZ_ServiceApp>()
                    .Where(p => p.State == v_common.YesState)
                    .ToListAsync();

                // Validate each enabled endpoint; LinkValidate is fire-and-forget
                // and reports its own results/exceptions.
                foreach (var item in services)
                {
                    if (item.App_IsEnable ?? false)
                    {
                        LinkValidate(item.HTZ_ServiceApp_Id, item.App_URL, (int)ServiceType.App, item.HTZ_ServiceApp_Name);
                    }
                    if (item.Web_IsEnable ?? false)
                    {
                        LinkValidate(item.HTZ_ServiceApp_Id, item.Web_URL, (int)ServiceType.Web, item.HTZ_ServiceApp_Name);
                    }
                }
            }
        }
        catch (Exception ex)
        {
            var e = new CrawlerException()
            {
                crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.Service,
                exceptionbrief = "应用服务链路异常",
                exceptionmessage = ex.Message,
                statuscode = 501,
                serviceid = 1
            };
            await CommonHelper.SaveException(e);
        }
        finally
        {
            // BUGFIX: Thread.Sleep pinned a thread-pool thread for up to 24h;
            // Task.Delay yields the thread while waiting.
            await Task.Delay(waittime);
        }
    }
}
/// <summary>
/// Fetches the detail page of a single news item and persists the parsed
/// result. Errors during fetch are rethrown as CrawlerException; errors during
/// parsing/saving are recorded via CommonHelper.SaveException.
/// </summary>
/// <param name="url">Detail-page URL to fetch.</param>
/// <param name="infoNode">XML configuration node describing how to parse the page.</param>
/// <param name="info">Information record being filled in.</param>
/// <param name="crawler">Crawler-service descriptor (id used for error reporting).</param>
/// <param name="encode">Character encoding of the target page.</param>
public static void InfomationDetailCrawler(string url, XmlNode infoNode, T_Information info, v_crawler crawler, string encode)
{
    // Dedicated crawler instance for this one detail page.
    var detailCrawler = new SimpleCrawler();

    // Fetch failure: surface as a CrawlerException carrying the service id.
    detailCrawler.OnError += (sender, args) =>
    {
        var fetchError = new CrawlerException()
        {
            crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
            exceptionbrief = "详情抓取出错",
            exceptionmessage = args.Exception.Message,
            statuscode = 500,
            serviceid = crawler.id
        };
        throw fetchError;
    };

    // Successful fetch: parse and save; any parse/save failure is logged
    // rather than propagated.
    detailCrawler.OnCompleted += async (sender, args) =>
    {
        try
        {
            using (var db = new BizDataContext())
            {
                await SaveInfomationDetail(args.PageSource, info, infoNode, db, crawler, url);
            }
        }
        catch (Exception ex)
        {
            var parseError = new CrawlerException()
            {
                crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
                exceptionbrief = "详情解析出错",
                exceptionmessage = ex.Message,
                statuscode = 500,
                serviceid = crawler.id
            };
            await CommonHelper.SaveException(parseError);
        }
    };

    // Original note: if not blocked, do not use the proxy 60.221.50.118:8090.
    detailCrawler.Start(new Uri(url), encode).Wait();
}
/// <summary>
/// Crawls the weekly weather page described by the crawler's XML config file
/// and saves the parsed data. Throws CrawlerException on configuration or
/// fetch/parse errors (caller is expected to catch and record it).
/// </summary>
/// <param name="crawler">Crawler-service descriptor; xmlfile points at the config.</param>
public static void WeatherCrawler(v_crawler crawler)
{
    // Load the XML configuration describing the target page.
    var cfg = new XmlDocument();
    cfg.Load(crawler.xmlfile);
    var rootNode = cfg.SelectSingleNode("data");
    if (rootNode == null)
    {
        var e = new CrawlerException()
        {
            crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
            exceptionbrief = "配置文件出错",
            exceptionmessage = "未找到主配置项data",
            statuscode = 500,
            serviceid = crawler.id
        };
        throw e;
    }

    var tideCrawler = new SimpleCrawler();

    // Fetch failure: surface as CrawlerException.
    tideCrawler.OnError += (s, e) =>
    {
        var ex = new CrawlerException()
        {
            crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
            exceptionbrief = "抓取出错",
            exceptionmessage = e.Exception.Message,
            statuscode = 500,
            serviceid = crawler.id
        };
        throw ex;
    };

    // Successful fetch: parse the page and persist the weekly data.
    tideCrawler.OnCompleted += async (s, e) =>
    {
        try
        {
            using (var db = new BizDataContext())
            {
                await SaveWeekData(e.PageSource, rootNode, db);
            }
        }
        catch (Exception ex)
        {
            var ee = new CrawlerException()
            {
                crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
                exceptionbrief = "解析出错",
                exceptionmessage = ex.Message,
                statuscode = 500,
                serviceid = crawler.id
            };
            throw ee;
        }
    };

    // BUGFIX: Attributes["url"].Value threw NullReferenceException when the
    // attribute was missing; with ?. a missing url simply skips the crawl
    // (the IsNullOrEmpty guard below now actually protects us).
    var url = rootNode.Attributes["url"]?.Value;

    // Page encoding, defaulting to utf-8.
    var encode = "utf-8";
    if (rootNode.Attributes["encode"] != null)
    {
        encode = rootNode.Attributes["encode"].Value;
    }

    if (!string.IsNullOrEmpty(url))
    {
        tideCrawler.Start(new Uri(url), encode).Wait();
    }
}
/// <summary>
/// Weather-crawling service loop: on every cycle, loads the enabled weather
/// crawler configuration, runs the weather and living-index crawls, records
/// the service state, then waits for the configured interval.
/// </summary>
/// <remarks>
/// Kept as <c>async void</c> for caller compatibility (fire-and-forget entry
/// point); all exceptions are handled inside the loop so none can escape.
/// </remarks>
public async static void Start()
{
    while (true)
    {
        // Default polling interval: 24 hours in milliseconds.
        var waittime = 24 * 60 * 60 * 1000;
        try
        {
            v_crawler crawler;
            using (var db = new BizDataContext())
            {
                // Optional override of the interval from the configuration table.
                var temp = db.Set<T_Configuration>()
                    .Where(p => p.Configuration_Key == CommonHelper.WEATHERKEY)
                    .FirstOrDefault();
                // BUGFIX: Int32.Parse threw FormatException on a malformed config
                // value; TryParse falls back to the default interval instead.
                if (temp != null && int.TryParse(temp.Configuration_Value, out var configuredWait))
                {
                    waittime = configuredWait;
                }

                // Load the enabled weather crawler-service configuration.
                crawler = await db.Set<T_HTZ_CrawlerService>()
                    .Where(p => p.State == v_common.YesState
                        && p.ServiceType == (int)HTZ_CrawlerService_ServiceTypeEnum.Weather
                        && p.IsEnable.Value)
                    .Select(p => new v_crawler
                    {
                        id = p.HTZ_CrawlerService_Id,
                        infotype = p.InfoType ?? 0,
                        name = p.HTZ_CrawlerService_Name,
                        xmlfile = p.XMLFilePath,
                        crawlertype = (int)HTZ_CrawlerService_ServiceTypeEnum.Weather
                    })
                    .FirstOrDefaultAsync();
            }

            // BUGFIX: FirstOrDefaultAsync returns null when no weather service
            // is enabled; previously this caused a NullReferenceException
            // inside WeatherCrawler. Skip the cycle instead.
            if (crawler != null)
            {
                // Weather crawl, then living-index crawl.
                WeatherCrawler(crawler);
                LivingIndexCrawler(crawler);
                // Record a healthy service state.
                await CommonHelper.SaveNewState((int)HTZ_ServiceState_ServiceStateEnum.Fine, crawler.id);
            }
        }
        catch (CrawlerException ex)
        {
            await CommonHelper.SaveException(ex);
        }
        catch (Exception ex)
        {
            var e = new CrawlerException()
            {
                crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
                exceptionbrief = "天气抓取服务错误",
                exceptionmessage = ex.Message,
                statuscode = 501,
                serviceid = 2
            };
            await CommonHelper.SaveException(e);
        }
        finally
        {
            // BUGFIX: Thread.Sleep pinned a thread-pool thread for up to 24h;
            // Task.Delay yields the thread while waiting.
            await Task.Delay(waittime);
        }
    }
}
/// <summary>
/// Crawls the living-index page described by the crawler's XML config file
/// (child node LivingIndexConfig) and saves the parsed data. Throws
/// CrawlerException on configuration or fetch/parse errors.
/// </summary>
/// <param name="crawler">Crawler-service descriptor; xmlfile points at the config.</param>
public static void LivingIndexCrawler(v_crawler crawler)
{
    // Load the XML configuration describing the target page.
    var cfg = new XmlDocument();
    cfg.Load(crawler.xmlfile);
    var rootNode = cfg.SelectSingleNode("data");
    if (rootNode == null)
    {
        var e = new CrawlerException()
        {
            crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
            exceptionbrief = "配置文件出错",
            exceptionmessage = "未找到主配置项data",
            statuscode = 500,
            serviceid = crawler.id
        };
        throw e;
    }

    var livingIndexCrawler = new SimpleCrawler();

    // Fetch failure: surface as CrawlerException.
    livingIndexCrawler.OnError += (s, e) =>
    {
        var ee = new CrawlerException()
        {
            crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
            exceptionbrief = "生活指数抓取出错",
            exceptionmessage = e.Exception.Message,
            statuscode = 500,
            serviceid = crawler.id
        };
        throw ee;
    };

    // Successful fetch: parse the page and persist the living-index data.
    livingIndexCrawler.OnCompleted += async (s, e) =>
    {
        try
        {
            using (var db = new BizDataContext())
            {
                await SaveLivingIndexData(e.PageSource, rootNode, db);
            }
        }
        catch (Exception ex)
        {
            var ee = new CrawlerException()
            {
                crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
                exceptionbrief = "详情解析出错",
                exceptionmessage = ex.Message,
                statuscode = 500,
                serviceid = crawler.id
            };
            throw ee;
        }
    };

    // BUGFIX: SelectSingleNode("LivingIndexConfig").Attributes["url"].Value
    // threw NullReferenceException when either the node or the attribute was
    // missing; with ?. a missing url simply skips the crawl (the
    // IsNullOrEmpty guard below now actually protects us).
    var url = rootNode.SelectSingleNode("LivingIndexConfig")?.Attributes["url"]?.Value;

    // Page encoding, defaulting to utf-8.
    var encode = "utf-8";
    if (rootNode.Attributes["encode"] != null)
    {
        encode = rootNode.Attributes["encode"].Value;
    }

    if (!string.IsNullOrEmpty(url))
    {
        livingIndexCrawler.Start(new Uri(url), encode).Wait();
    }
}
/// <summary>
/// News-crawling entry point: loads the XML config for the given crawler
/// service, fetches the listing page, and hands the page source to
/// SaveInfomation. All errors are recorded as CrawlerExceptions.
/// </summary>
/// <param name="crawler">Crawler-service descriptor (id, xmlfile, url/encode filled here).</param>
/// <remarks>
/// Kept as <c>async void</c> for caller compatibility; the body is wrapped in
/// Task.Run and handles all exceptions internally.
/// </remarks>
public async static void InfomationCrawler(v_crawler crawler)
{
    await Task.Run(async () =>
    {
        try
        {
            // Load the XML configuration describing the listing page.
            var xmlDocument = new XmlDocument();
            xmlDocument.Load(crawler.xmlfile);
            var rootNode = xmlDocument.SelectSingleNode("data");
            if (rootNode == null)
            {
                throw new CrawlerException(crawler.id, "配置文件错误", "未找到主配置项data");
            }

            // Page encoding (defaulted) and target URL from the config root.
            crawler.encode = DefaultEncode;
            if (rootNode.Attributes["encode"] != null)
            {
                crawler.encode = rootNode.Attributes["encode"].Value;
            }
            crawler.url = rootNode.Attributes["url"].Value;

            var infoCrawler = new SimpleCrawler();

            // Fetch failure: surface as CrawlerException.
            infoCrawler.OnError += (s, e) =>
            {
                throw new CrawlerException(crawler.id, "获取页面代码时错误", e.Exception.Message);
            };

            // Successful fetch: parse/save the listing, then record a healthy state.
            infoCrawler.OnCompleted += async (s, e) =>
            {
                try
                {
                    using (var db = new BizDataContext())
                    {
                        SaveInfomation(e.PageSource, crawler, xmlDocument, db);
                        await CommonHelper.SaveNewState((int)HTZ_ServiceState_ServiceStateEnum.Fine, crawler.id);
                    }
                }
                catch (Exception ex)
                {
                    var messageException = new CrawlerException(crawler.id, "解析出错", ex.Message);
                    await SaveException(messageException);
                }
            };

            if (!string.IsNullOrEmpty(crawler.url))
            {
                // BUGFIX: .Wait() blocked inside an async lambda and wrapped any
                // CrawlerException in an AggregateException, so the
                // catch (CrawlerException) below never matched. Awaiting
                // preserves the original exception type.
                await infoCrawler.Start(new Uri(crawler.url), crawler.encode);
            }
        }
        catch (CrawlerException ex)
        {
            await CommonHelper.SaveException(ex);
        }
        catch (Exception ex)
        {
            var e = new CrawlerException()
            {
                crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
                exceptionbrief = "信息抓取服务错误",
                exceptionmessage = ex.Message,
                statuscode = 501,
                // BUGFIX: serviceid was hard-coded to 2 (copy-paste from the
                // weather service); report the actual crawler's id.
                serviceid = crawler.id
            };
            await CommonHelper.SaveException(e);
        }
    });
}
/// <summary>
/// Validates one service endpoint: issues a request to <paramref name="url"/>,
/// records a healthy state (with latency) on HTTP 200, otherwise records a
/// CrawlerException describing the failure.
/// </summary>
/// <param name="id">Service id (HTZ_ServiceApp_Id).</param>
/// <param name="url">Endpoint URL to probe.</param>
/// <param name="serviceType">ServiceType.App or ServiceType.Web (as int).</param>
/// <param name="serviceName">Display name used in error records.</param>
/// <remarks>
/// Fire-and-forget (<c>async void</c>) by design — the caller's loop does not
/// wait for individual probes; all exceptions are handled internally.
/// </remarks>
private static async void LinkValidate(int id, string url, int serviceType, string serviceName)
{
    try
    {
        var app = new LinkService();

        app.OnCompleted += (async (s, e) =>
        {
            if (e.StatusCode == HttpStatusCode.OK)
            {
                await CommonHelper.SaveNewState((int)HTZ_ServiceState_ServiceStateEnum.Fine, id, (int)e.Milliseconds, serviceType);
            }
            else
            {
                var ex = new CrawlerException()
                {
                    crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.Service,
                    exceptionbrief = EnumHelper.GetDescription(e.StatusCode),
                    exceptionmessage = EnumHelper.GetDescription(e.StatusCode),
                    statuscode = (int)e.StatusCode,
                    serviceid = id,
                    servicename = serviceName,
                    serverAppType = serviceType
                };
                // BUGFIX: this handler is an async void delegate — a throw here
                // escapes the surrounding try/catch entirely and can take down
                // the process. Record the failure directly instead.
                await CommonHelper.SaveException(ex);
            }
        });

        app.OnError += ((s, e) =>
        {
            // Synchronous handler: the throw propagates through BeginRequest
            // and is caught by the CrawlerException handler below.
            var ex = new CrawlerException()
            {
                crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.Service,
                exceptionbrief = "请求时出错",
                exceptionmessage = e.Exception.Message,
                statuscode = (int)HttpStatusCode.InternalServerError,
                serviceid = id,
                servicename = serviceName,
                serverAppType = serviceType,
            };
            throw ex;
        });

        await app.BeginRequest(new Uri(url));
    }
    catch (CrawlerException ex)
    {
        await CommonHelper.SaveException(ex);
    }
    catch (Exception ex)
    {
        var e = new CrawlerException()
        {
            crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.Service,
            exceptionbrief = "应用服务链路异常",
            exceptionmessage = ex.Message,
            statuscode = 501,
            serviceid = id,
            servicename = serviceName
        };
        await CommonHelper.SaveException(e);
    }
}