public ActionResult CreateNewInfo(T_Information model)
{
    // Default the creation time when the client did not supply one.
    model.CreateTime = model.CreateTime ?? DateTime.Now;
    dbContent.Entry(model).State = EntityState.Added;

    // Save the uploaded cover image, if any, and store its path on the entity.
    // Guard against a missing upload so SaveFile is never called with null.
    var file = Request.Files["file_conver"];
    if (file != null && file.ContentLength > 0)
    {
        model.Conver = SaveFile(file);
    }
    dbContent.SaveChanges();
    return Ok();
}
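// SaveFile is referenced above but not shown in this section. A minimal
// sketch of what it might look like, assuming MVC 5 (System.Web.Mvc), a
// hypothetical ~/Upload folder, and that the returned relative path is
// what gets stored in T_Information.Conver (requires System.IO):
private string SaveFile(HttpPostedFileBase file)
{
    // Random file name to avoid collisions; keep the original extension.
    var fileName = Guid.NewGuid().ToString("N") + Path.GetExtension(file.FileName);
    var folder = Server.MapPath("~/Upload");
    Directory.CreateDirectory(folder); // no-op if the folder already exists
    file.SaveAs(Path.Combine(folder, fileName));
    return "/Upload/" + fileName;
}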
public ActionResult ModifyNewInfo(T_Information model)
{
    dbContent.Entry(model).State = EntityState.Modified;

    var file = Request.Files["file_conver"];
    if (file != null && file.ContentLength > 0)
    {
        // A new cover was uploaded: save it and overwrite the stored path.
        model.Conver = SaveFile(file);
    }
    else
    {
        // No new cover: keep the existing value instead of overwriting it with null.
        dbContent.Entry(model).Property(x => x.Conver).IsModified = false;
    }
    dbContent.SaveChanges();
    return Ok();
}
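// Marking individual properties with IsModified = false is the standard EF6
// way to exclude columns from the UPDATE issued for a detached entity. A
// small sketch of the same pattern applied to several server-managed
// columns (the property names below are illustrative, not from this model):
dbContent.Entry(model).State = EntityState.Modified;
dbContent.Entry(model).Property(x => x.CreateTime).IsModified = false;   // keep original creation time
dbContent.Entry(model).Property(x => x.CreateUserId).IsModified = false; // illustrative: keep original author
dbContent.SaveChanges();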
/// <summary>
/// Crawls the detail page of a local news item from Qingwang (青网).
/// </summary>
public static void InfomationDetailCrawler(string url, XmlNode infoNode, T_Information info, v_crawler crawler, string encode)
{
    var infoDetailCrawler = new SimpleCrawler(); // create a new crawler service

    infoDetailCrawler.OnError += (s, e) =>
    {
        var ee = new CrawlerException()
        {
            crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
            exceptionbrief = "详情抓取出错", // "error while crawling the detail page"
            exceptionmessage = e.Exception.Message,
            statuscode = 500,
            serviceid = crawler.id
        };
        throw ee;
    };

    infoDetailCrawler.OnCompleted += async (s, e) =>
    {
        try
        {
            using (var db = new BizDataContext())
            {
                await SaveInfomationDetail(e.PageSource, info, infoNode, db, crawler, url);
            }
        }
        catch (Exception ex)
        {
            var ee = new CrawlerException()
            {
                crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
                exceptionbrief = "详情解析出错", // "error while parsing the detail page"
                exceptionmessage = ex.Message,
                statuscode = 500,
                serviceid = crawler.id
            };
            await CommonHelper.SaveException(ee);
        }
    };

    // Don't use a proxy unless the crawler gets blocked: 60.221.50.118:8090.
    // Note: OnCompleted is an async void handler, so Wait() only waits for the
    // page download, not for SaveInfomationDetail to finish.
    infoDetailCrawler.Start(new Uri(url), encode).Wait();
}
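// SimpleCrawler is not defined in this section. A sketch of the surface the
// code above relies on: Start(Uri, string), OnError with e.Exception, and
// OnCompleted with e.PageSource are taken from the calls above; the
// WebClient-based body and everything else are assumptions.
using System;
using System.Net;
using System.Text;
using System.Threading.Tasks;

public class OnErrorEventArgs : EventArgs
{
    public Exception Exception { get; set; }
}

public class OnCompletedEventArgs : EventArgs
{
    public string PageSource { get; set; }
}

public class SimpleCrawler
{
    public event EventHandler<OnErrorEventArgs> OnError;
    public event EventHandler<OnCompletedEventArgs> OnCompleted;

    // Downloads the page at uri using the given encoding (e.g. "utf-8"),
    // then raises OnCompleted with the page source, or OnError on failure.
    public async Task Start(Uri uri, string encode)
    {
        try
        {
            using (var client = new WebClient())
            {
                client.Encoding = Encoding.GetEncoding(encode);
                var html = await client.DownloadStringTaskAsync(uri);
                OnCompleted?.Invoke(this, new OnCompletedEventArgs { PageSource = html });
            }
        }
        catch (Exception ex)
        {
            OnError?.Invoke(this, new OnErrorEventArgs { Exception = ex });
        }
    }
}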
/// <summary>
/// Parses the list-page data and saves the news items.
/// </summary>
/// <param name="html">raw HTML of the list page</param>
/// <param name="crawler">crawler service configuration</param>
/// <param name="infoNode">XML configuration data</param>
/// <param name="db">database connection</param>
public static void SaveInfomation(string html, v_crawler crawler, XmlDocument infoNode, BizDataContext db)
{
    // Root node of the page data.
    var rootNode = CommonHelper.GetRootNode(html);
    // Root node of the configuration data.
    var xmlRoot = infoNode.SelectSingleNode("data");
    // Configuration node describing how to locate the news list.
    var listConfig = xmlRoot.SelectSingleNode("ListConfig");

    // The goal of this loop is to end up with a value for modelListNodes,
    // i.e. the news list. Several layers of irrelevant markup may have to be
    // peeled away first, hence the foreach: the normal flow is several
    // SelectSingleNode steps followed by one SelectNodes call that yields the list.
    HtmlNode modelNode = null;
    HtmlNodeCollection modelListNodes = null;
    foreach (XmlNode item in listConfig.ChildNodes)
    {
        if (modelNode == null)
        {
            modelNode = rootNode;
        }
        if (item.Attributes["issingleselect"].Value.ToBool())
        {
            // Peel away another layer of irrelevant markup.
            modelNode = modelNode.SelectSingleNode(item.Attributes["signstr"].Value);
        }
        else
        {
            // The news list itself has been reached; the loop ends here.
            modelListNodes = modelNode.SelectNodes(item.Attributes["signstr"].Value);
            break;
        }
    }

    // Configuration node describing how to map a list entry onto a news entity.
    var infoConfig = xmlRoot.SelectSingleNode("InfoConfig");

    // Process each entry of the news list found above.
    foreach (HtmlNode info in modelListNodes)
    {
        T_Information entity = new T_Information();
        var detailUrl = string.Empty;

        // The entity mapping consists of several child nodes, one per property,
        // so assign them in a loop.
        foreach (XmlNode property in infoConfig.ChildNodes)
        {
            if (property.Name == "property")
            {
                entity = CommonHelper.GetProperty(entity, info, property);
            }
            else if (property.Name == "DetailUrl")
            {
                detailUrl = GetUrl(info, property, crawler.url);
            }
        }

        var count = db.Set<T_Information>()
                      .Where(p => p.Information_Id == entity.Information_Id)
                      .Select(p => 1)
                      .Count();
        if (count >= 1)
        {
            // The item already exists, so stop: the list is assumed to be
            // newest-first, meaning everything after it was crawled earlier.
            return;
        }

        entity.State = (int)T_InformationStateEnum.Publish;
        entity.InformationType = (int)t_informationtypeenum.customcontent;
        entity.InfoTypeIds = crawler.infotype + ",";
        // Record the source URL before the detail crawl persists the entity;
        // assigning it afterwards would never reach the database.
        entity.OriginalSourceUrl = detailUrl;
        if (!string.IsNullOrEmpty(detailUrl))
        {
            // With the list fields assigned, fetch the detail page to complete
            // and save the entity.
            InfomationDetailCrawler(detailUrl, xmlRoot, entity, crawler, crawler.encode);
        }
    }
}
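// The parser above implies a particular shape for the XML configuration. A
// hypothetical example: element and attribute names are taken from the
// SelectSingleNode / Attributes calls in SaveInfomation and
// SaveInfomationDetail; the XPath values are invented for illustration.
var sampleConfig = @"
<data>
  <ListConfig>
    <!-- issingleselect='true': peel one layer with SelectSingleNode -->
    <node issingleselect='true' signstr='//div/ul' />
    <!-- issingleselect='false': SelectNodes returns the news list; the loop ends -->
    <node issingleselect='false' signstr='.//li' />
  </ListConfig>
  <InfoConfig>
    <property name='Title' signstr='.//a' />
    <DetailUrl signstr='.//a' />
  </InfoConfig>
  <DataDetail>
    <property name='Content' signstr='//article' />
  </DataDetail>
</data>";

var doc = new System.Xml.XmlDocument();
doc.LoadXml(sampleConfig);
// doc could then be passed to SaveInfomation as its infoNode argument.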
private static async Task SaveInfomationDetail(string pageSource, T_Information info, XmlNode infoNode, BizDataContext db, v_crawler crawler, string detailUrl)
{
    var rootNode = CommonHelper.GetRootNode(pageSource);
    if (info != null)
    {
        var xmlRoot = infoNode.SelectSingleNode("DataDetail");
        var detailHost = new Regex(@"\S+/").Match(detailUrl).Value;

        // The detail mapping consists of several child nodes, one per property,
        // so assign them in a loop.
        foreach (XmlNode property in xmlRoot.ChildNodes)
        {
            info = CommonHelper.GetProperty(info, rootNode, property, detailHost);
        }

        var info_infotag = new T_InfoType_Information()
        {
            InfoType_Information_Id = await db.GetNextIdentity_IntAsync(),
            CreateTime = DateTime.Now,
            InformationId = info.Information_Id,
            InfoTypeId = crawler.infotype,
        };
        var informationcontent = new T_Information_Content()
        {
            Information_Content_Id = await db.GetNextIdentity_IntAsync(),
            Conent = info.Content,
            ContentType = (int)Information_Content_ContentTypeEnum.TextWords,
            OrderIndex = 0,
            InformationId = info.Information_Id,
            State = 0,
            CreateTime = DateTime.Now,
        };
        info.ClassifyId = (int)InfoSource.Crawler;
        info.PublishTime = info.CreateTime;
        info.IsTop = false;

        // Only persist items that actually have content.
        if (!string.IsNullOrEmpty(informationcontent.Conent))
        {
            // If the content contains an image, store its src as an attachment.
            var imgMatch = new Regex("<img.*?>").Match(info.Content);
            if (imgMatch.Success)
            {
                // Capture just the URL between the quotes; matching the whole
                // src="..." attribute would store the prefix and quotes too.
                var srcMatch = new Regex("src=\"(.*?)\"").Match(imgMatch.Value);
                if (srcMatch.Success)
                {
                    var att = new T_Attachment()
                    {
                        Attachment_ID = await db.GetNextIdentity_IntAsync(),
                        FilePath = srcMatch.Groups[1].Value,
                        State = 0,
                        CreateTime = DateTime.Now,
                    };
                    await db.InsertAsync(att);
                }
            }
            await db.InsertAsync(info);
            await db.InsertAsync(info_infotag);
            await db.InsertAsync(informationcontent);
        }
    }
}
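// CommonHelper.GetProperty maps one configured node onto an entity property;
// its implementation is not shown in this section. A minimal reflection-based
// sketch, assuming each property node carries "name" and "signstr" attributes
// (both attribute names are hypothetical) and string-typed targets:
public static T GetProperty<T>(T entity, HtmlNode html, XmlNode config, string host = null)
{
    var propertyName = config.Attributes["name"].Value; // hypothetical attribute
    var xpath = config.Attributes["signstr"].Value;     // hypothetical attribute
    var node = html.SelectSingleNode(xpath);
    if (node != null)
    {
        var prop = typeof(T).GetProperty(propertyName);
        if (prop != null && prop.PropertyType == typeof(string))
        {
            // InnerText covers plain text fields; a full implementation would
            // also read attributes (e.g. img src) and prefix relative URLs
            // with the host parameter.
            prop.SetValue(entity, node.InnerText.Trim());
        }
    }
    return entity;
}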