/// <summary>
/// Main service loop: periodically runs the weather and living-index crawlers.
/// Runs forever; the interval between rounds is read from configuration and
/// falls back to 24 hours when no value is stored.
/// </summary>
public async static Task Start()
{
    // NOTE: changed from `async void` to `async Task` so callers can observe
    // faults; existing `Start();` call sites still compile.
    while (true)
    {
        // Default crawl interval: 24 hours, in milliseconds.
        var waittime = 24 * 60 * 60 * 1000;
        try
        {
            v_crawler crawler = null;
            using (var db = new BizDataContext())
            {
                // Read the crawl interval from configuration.
                var temp = db.Set<T_Configuration>()
                    .Where(p => p.Configuration_Key == CommonHelper.WEATHERKEY)
                    .FirstOrDefault();
                if (temp != null)
                {
                    waittime = Int32.Parse(temp.Configuration_Value);
                }

                // Load the enabled weather-crawler service configuration.
                crawler = await db.Set<T_HTZ_CrawlerService>()
                    .Where(p => p.State == v_common.YesState
                        && p.ServiceType == (int)HTZ_CrawlerService_ServiceTypeEnum.Weather
                        && p.IsEnable.Value)
                    .Select(p => new v_crawler
                    {
                        id = p.HTZ_CrawlerService_Id,
                        infotype = p.InfoType ?? 0,
                        name = p.HTZ_CrawlerService_Name,
                        xmlfile = p.XMLFilePath,
                        crawlertype = (int)HTZ_CrawlerService_ServiceTypeEnum.Weather
                    })
                    .FirstOrDefaultAsync();
            }

            // BUG FIX: the original dereferenced `crawler` unconditionally, so a
            // missing/disabled service row caused a NullReferenceException every round.
            if (crawler != null)
            {
                // Crawl weather data.
                WeatherCrawler(crawler);
                // Crawl living-index data.
                LivingIndexCrawler(crawler);
                // Record a healthy service state.
                await CommonHelper.SaveNewState((int)HTZ_ServiceState_ServiceStateEnum.Fine, crawler.id);
            }
        }
        catch (CrawlerException ex)
        {
            await CommonHelper.SaveException(ex);
        }
        catch (Exception ex)
        {
            // Wrap unexpected failures so they are persisted in the same store.
            var e = new CrawlerException()
            {
                crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
                exceptionbrief = "天气抓取服务错误",
                exceptionmessage = ex.Message,
                statuscode = 501,
                serviceid = 2
            };
            await CommonHelper.SaveException(e);
        }
        finally
        {
            // BUG FIX: was Thread.Sleep, which blocked the thread for up to 24h
            // inside an async method; Task.Delay yields the thread instead.
            await Task.Delay(waittime);
        }
    }
}
/// <summary>
/// Crawls weather data as described by the crawler's XML configuration file.
/// </summary>
/// <param name="crawler">Crawler configuration (service id, xml file path, ...).</param>
/// <exception cref="CrawlerException">Thrown when the XML configuration is invalid.</exception>
public static void WeatherCrawler(v_crawler crawler)
{
    // Load the XML configuration file.
    var cfg = new XmlDocument();
    cfg.Load(crawler.xmlfile);
    var rootNode = cfg.SelectSingleNode("data");
    if (rootNode == null)
    {
        var e = new CrawlerException()
        {
            crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
            exceptionbrief = "配置文件出错",
            exceptionmessage = "未找到主配置项data",
            statuscode = 500,
            serviceid = crawler.id
        };
        throw e;
    }
    var tideCrawler = new SimpleCrawler(); // crawler instance
    // Download failure: surface as a CrawlerException.
    tideCrawler.OnError += (s, e) =>
    {
        var ex = new CrawlerException()
        {
            crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
            exceptionbrief = "抓取出错",
            exceptionmessage = e.Exception.Message,
            statuscode = 500,
            serviceid = crawler.id
        };
        throw ex;
    };
    // Parse and persist the page once downloaded.
    tideCrawler.OnCompleted += async (s, e) =>
    {
        try
        {
            using (var db = new BizDataContext())
            {
                await SaveWeekData(e.PageSource, rootNode, db);
            }
        }
        catch (Exception ex)
        {
            var ee = new CrawlerException()
            {
                crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
                exceptionbrief = "解析出错",
                exceptionmessage = ex.Message,
                statuscode = 500,
                serviceid = crawler.id
            };
            // BUG FIX: the original re-threw from an async void event handler,
            // which loses the exception (or tears down the process) instead of
            // recording it. Persist it, as the detail-crawler handler does.
            await CommonHelper.SaveException(ee);
        }
    };
    // BUG FIX: the url attribute was dereferenced without a null check, turning
    // a missing configuration entry into a NullReferenceException.
    var urlAttribute = rootNode.Attributes["url"];
    if (urlAttribute == null)
    {
        throw new CrawlerException()
        {
            crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
            exceptionbrief = "配置文件出错",
            exceptionmessage = "未找到url配置",
            statuscode = 500,
            serviceid = crawler.id
        };
    }
    var url = urlAttribute.Value;
    // Page encoding, defaulting to utf-8.
    var encode = "utf-8";
    if (rootNode.Attributes["encode"] != null)
    {
        encode = rootNode.Attributes["encode"].Value;
    }
    // Start crawling.
    if (!string.IsNullOrEmpty(url))
    {
        tideCrawler.Start(new Uri(url), encode).Wait();
    }
}
/// <summary>
/// Crawls living-index data as described by the crawler's XML configuration file.
/// </summary>
/// <param name="crawler">Crawler configuration (service id, xml file path, ...).</param>
/// <exception cref="CrawlerException">Thrown when the XML configuration is invalid.</exception>
public static void LivingIndexCrawler(v_crawler crawler)
{
    var cfg = new XmlDocument();
    cfg.Load(crawler.xmlfile);
    var rootNode = cfg.SelectSingleNode("data");
    if (rootNode == null)
    {
        var e = new CrawlerException()
        {
            crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
            exceptionbrief = "配置文件出错",
            exceptionmessage = "未找到主配置项data",
            statuscode = 500,
            serviceid = crawler.id
        };
        throw e;
    }
    var livingIndexCrawler = new SimpleCrawler(); // crawler instance
    // Download failure: surface as a CrawlerException.
    livingIndexCrawler.OnError += (s, e) =>
    {
        var ee = new CrawlerException()
        {
            crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
            exceptionbrief = "生活指数抓取出错",
            exceptionmessage = e.Exception.Message,
            statuscode = 500,
            serviceid = crawler.id
        };
        throw ee;
    };
    // Parse and persist the page once downloaded.
    livingIndexCrawler.OnCompleted += async (s, e) =>
    {
        try
        {
            using (var db = new BizDataContext())
            {
                await SaveLivingIndexData(e.PageSource, rootNode, db);
            }
        }
        catch (Exception ex)
        {
            var ee = new CrawlerException()
            {
                crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
                exceptionbrief = "详情解析出错",
                exceptionmessage = ex.Message,
                statuscode = 500,
                serviceid = crawler.id
            };
            // BUG FIX: the original re-threw from an async void event handler,
            // which loses the exception instead of recording it.
            await CommonHelper.SaveException(ee);
        }
    };
    // BUG FIX: the original chained SelectSingleNode(...).Attributes["url"].Value
    // without null checks, so a missing LivingIndexConfig node produced a
    // NullReferenceException instead of a configuration error.
    var livingIndexConfig = rootNode.SelectSingleNode("LivingIndexConfig");
    if (livingIndexConfig == null || livingIndexConfig.Attributes["url"] == null)
    {
        throw new CrawlerException()
        {
            crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
            exceptionbrief = "配置文件出错",
            exceptionmessage = "未找到LivingIndexConfig的url配置",
            statuscode = 500,
            serviceid = crawler.id
        };
    }
    var url = livingIndexConfig.Attributes["url"].Value;
    // Page encoding, defaulting to utf-8.
    var encode = "utf-8";
    if (rootNode.Attributes["encode"] != null)
    {
        encode = rootNode.Attributes["encode"].Value;
    }
    if (!string.IsNullOrEmpty(url))
    {
        livingIndexCrawler.Start(new Uri(url), encode).Wait();
    }
}
/// <summary>
/// Crawls news/information items for the configured news column.
/// </summary>
/// <param name="crawler">Crawler configuration: service id, xml file path, and
/// the news column type (<c>infotype</c>) to file items under.</param>
public async static Task InfomationCrawler(v_crawler crawler)
{
    // NOTE: changed from `async void` to `async Task` so failures are
    // observable by callers; existing fire-and-forget call sites still compile.
    await Task.Run(async () =>
    {
        try
        {
            var xmlDocument = new XmlDocument();
            xmlDocument.Load(crawler.xmlfile);
            var rootNode = xmlDocument.SelectSingleNode("data");
            if (rootNode == null)
            {
                throw new CrawlerException(crawler.id, "配置文件错误", "未找到主配置项data");
            }
            // Page encoding, defaulting to the service-wide default.
            crawler.encode = DefaultEncode;
            if (rootNode.Attributes["encode"] != null)
            {
                crawler.encode = rootNode.Attributes["encode"].Value;
            }
            crawler.url = rootNode.Attributes["url"].Value;
            var infoCrawler = new SimpleCrawler(); // crawler instance
            infoCrawler.OnError += (s, e) =>
            {
                throw new CrawlerException(crawler.id, "获取页面代码时错误", e.Exception.Message);
            };
            infoCrawler.OnCompleted += async (s, e) =>
            {
                try
                {
                    using (var db = new BizDataContext())
                    {
                        // Parse the list page and persist new items, then
                        // record a healthy service state.
                        SaveInfomation(e.PageSource, crawler, xmlDocument, db);
                        await CommonHelper.SaveNewState((int)HTZ_ServiceState_ServiceStateEnum.Fine, crawler.id);
                    }
                }
                catch (Exception ex)
                {
                    var messageException = new CrawlerException(crawler.id, "解析出错", ex.Message);
                    await SaveException(messageException);
                }
            };
            if (!string.IsNullOrEmpty(crawler.url))
            {
                // BUG FIX: was `.Wait()`, which blocked a pool thread and wrapped
                // any CrawlerException in an AggregateException, so the specific
                // `catch (CrawlerException)` below could never match.
                await infoCrawler.Start(new Uri(crawler.url), crawler.encode);
            }
        }
        catch (CrawlerException ex)
        {
            await CommonHelper.SaveException(ex);
        }
        catch (Exception ex)
        {
            // Wrap unexpected failures so they are persisted in the same store.
            var e = new CrawlerException()
            {
                crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
                exceptionbrief = "信息抓取服务错误",
                exceptionmessage = ex.Message,
                statuscode = 501,
                serviceid = 2
            };
            await CommonHelper.SaveException(e);
        }
    });
}
/// <summary>
/// Parses a crawled news list page and saves each news item it finds,
/// skipping items that already exist in the database.
/// </summary>
/// <param name="html">Raw html of the list page.</param>
/// <param name="crawler">Crawler configuration (url, encode, info type).</param>
/// <param name="infoNode">XML configuration document.</param>
/// <param name="db">Database context.</param>
public static void SaveInfomation(string html, v_crawler crawler, XmlDocument infoNode, BizDataContext db)
{
    // Root node of the downloaded page.
    var rootNode = CommonHelper.GetRootNode(html);
    // Root node of the XML configuration.
    var xmlRoot = infoNode.SelectSingleNode("data");
    // Configuration describing how to locate the news list in the page.
    var listConfig = xmlRoot.SelectSingleNode("ListConfig");
    // Walk the ListConfig children: each "issingleselect" entry peels away one
    // wrapper element; the first non-single entry selects the final news list
    // (modelListNodes) and ends the walk.
    HtmlNode modelNode = null;
    HtmlNodeCollection modelListNodes = null;
    foreach (XmlNode item in listConfig.ChildNodes)
    {
        if (modelNode == null)
        {
            modelNode = rootNode;
        }
        if (item.Attributes["issingleselect"].Value.ToBool())
        {
            // Narrow down to the wrapper element.
            modelNode = modelNode.SelectSingleNode(item.Attributes["signstr"].Value);
        }
        else
        {
            // Final select: the list of news entries.
            modelListNodes = modelNode.SelectNodes(item.Attributes["signstr"].Value);
            break;
        }
    }
    // BUG FIX: guard against a configuration/page mismatch; the original fell
    // straight into the foreach and threw NullReferenceException.
    if (modelListNodes == null)
    {
        return;
    }
    // Configuration describing how to map a list entry onto T_Information.
    var infoConfig = xmlRoot.SelectSingleNode("InfoConfig");
    foreach (HtmlNode info in modelListNodes)
    {
        T_Information entity = new T_Information();
        var detailUrl = string.Empty;
        // Each child node of InfoConfig maps one property of the entity.
        foreach (XmlNode property in infoConfig.ChildNodes)
        {
            if (property.Name == "property")
            {
                entity = CommonHelper.GetProperty(entity, info, property);
            }
            else if (property.Name == "DetailUrl")
            {
                detailUrl = GetUrl(info, property, crawler.url);
            }
        }
        var count = db.Set<T_Information>()
            .Where(p => p.Information_Id == entity.Information_Id)
            .Select(p => 1)
            .Count();
        if (count >= 1)
        {
            // BUG FIX: the original used `return` here, which aborted the whole
            // list as soon as one already-saved item was encountered; only the
            // duplicate item should be skipped.
            continue;
        }
        entity.State = (int)T_InformationStateEnum.Publish;
        entity.InformationType = (int)t_informationtypeenum.customcontent;
        entity.InfoTypeIds = crawler.infotype + ",";
        // BUG FIX: assign the source url BEFORE the detail crawl — the detail
        // crawler persists the entity, so a value set afterwards was never saved.
        entity.OriginalSourceUrl = detailUrl;
        if (!string.IsNullOrEmpty(detailUrl))
        {
            // Visit the detail page to complete and persist the entity.
            InfomationDetailCrawler(detailUrl, xmlRoot, entity, crawler, crawler.encode);
        }
    }
}
/// <summary>
/// Parses a crawled news detail page and persists the completed news entity
/// together with its column link, content row, and (optionally) first image
/// attachment.
/// NOTE(review): all inserts are gated on the content being non-empty — an
/// item whose parsed content is blank is silently dropped; confirm this is
/// intentional.
/// </summary>
/// <param name="pageSource">Raw html of the detail page.</param>
/// <param name="info">Partially-filled news entity from the list page.</param>
/// <param name="infoNode">XML configuration node containing "DataDetail".</param>
/// <param name="db">Database context used for id generation and inserts.</param>
/// <param name="crawler">Crawler configuration (supplies the info type id).</param>
/// <param name="detailUrl">Url of the detail page; its host part is used to
/// absolutize relative links during property extraction.</param>
private async static Task SaveInfomationDetail(string pageSource, T_Information info, XmlNode infoNode, BizDataContext db, v_crawler crawler, string detailUrl)
{
    var rootNode = CommonHelper.GetRootNode(pageSource);
    if (info != null)
    {
        var xmlRoot = infoNode.SelectSingleNode("DataDetail");
        // Everything up to (and including) the last '/' of the detail url —
        // passed to GetProperty, presumably to resolve relative urls; confirm.
        var detailHost = new Regex(@"\S+/").Match(detailUrl).Value;
        // Each child node of DataDetail maps one property of the entity.
        foreach (XmlNode property in xmlRoot.ChildNodes)
        {
            info = CommonHelper.GetProperty(info, rootNode, property, detailHost);
        }
        // Link row tying the news item to its column (info type).
        var info_infotag = new T_InfoType_Information()
        {
            InfoType_Information_Id = await db.GetNextIdentity_IntAsync(),
            CreateTime = DateTime.Now,
            InformationId = info.Information_Id,
            InfoTypeId = crawler.infotype,
        };
        // Content row holding the article body as plain text/words.
        var informationcontent = new T_Information_Content()
        {
            Information_Content_Id = await db.GetNextIdentity_IntAsync(),
            Conent = info.Content,
            ContentType = (int)Information_Content_ContentTypeEnum.TextWords,
            OrderIndex = 0,
            InformationId = info.Information_Id,
            State = 0,
            CreateTime = DateTime.Now,
        };
        info.ClassifyId = (int)InfoSource.Crawler;
        info.PublishTime = info.CreateTime;
        info.IsTop = false;
        if (!string.IsNullOrEmpty(informationcontent.Conent))
        {
            // Extract the first <img> tag's src from the body, if any, and
            // store it as an attachment (used as the cover image, presumably).
            var regex = new Regex("<img.*?>");
            var imgMtach = regex.Match(info.Content);
            if (imgMtach.Success)
            {
                var img = imgMtach.Value;
                var srcMatch = new Regex("src=\".*?\"").Match(img);
                if (srcMatch.Success)
                {
                    // NOTE(review): this stores the raw `src="..."` fragment,
                    // quotes and attribute name included — verify downstream
                    // consumers expect that rather than the bare url.
                    var src = srcMatch.Value;
                    var att = new T_Attachment()
                    {
                        Attachment_ID = await db.GetNextIdentity_IntAsync(),
                        FilePath = src,
                        State = 0,
                        CreateTime = DateTime.Now,
                    };
                    await db.InsertAsync(att);
                }
            }
            // Persist the entity, its column link, and its content row.
            await db.InsertAsync(info);
            await db.InsertAsync(info_infotag);
            await db.InsertAsync(informationcontent);
        }
    }
}
/// <summary>
/// Crawls a single news detail page and hands the page source to
/// SaveInfomationDetail for parsing and persistence.
/// </summary>
/// <param name="url">Url of the detail page to fetch.</param>
/// <param name="infoNode">XML configuration node for detail parsing.</param>
/// <param name="info">Partially-filled news entity to complete.</param>
/// <param name="crawler">Crawler configuration (supplies the service id).</param>
/// <param name="encode">Character encoding of the detail page.</param>
public static void InfomationDetailCrawler(string url, XmlNode infoNode, T_Information info, v_crawler crawler, string encode)
{
    // Dedicated crawler instance for this one detail page.
    var detailCrawler = new SimpleCrawler();

    // Download failure: surface as a CrawlerException.
    detailCrawler.OnError += (sender, args) =>
    {
        throw new CrawlerException()
        {
            crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
            exceptionbrief = "详情抓取出错",
            exceptionmessage = args.Exception.Message,
            statuscode = 500,
            serviceid = crawler.id
        };
    };

    // Once the page is downloaded, parse it and persist the entity; parsing
    // failures are recorded rather than re-thrown.
    detailCrawler.OnCompleted += async (sender, args) =>
    {
        try
        {
            using (var db = new BizDataContext())
            {
                await SaveInfomationDetail(args.PageSource, info, infoNode, db, crawler, url);
            }
        }
        catch (Exception ex)
        {
            var failure = new CrawlerException()
            {
                crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
                exceptionbrief = "详情解析出错",
                exceptionmessage = ex.Message,
                statuscode = 500,
                serviceid = crawler.id
            };
            await CommonHelper.SaveException(failure);
        }
    };

    // Fetch synchronously; do not route through the proxy (60.221.50.118:8090)
    // unless the crawler gets blocked.
    detailCrawler.Start(new Uri(url), encode).Wait();
}