/// <summary>
/// Downloads the article list of one channel, then fetches and stores every article whose
/// source URL has not been saved yet.
/// </summary>
/// <param name="chlCfg">
/// Channel to crawl. <c>ChannelVal</c> is formatted into <c>site.IndexUrl</c> and is also the
/// wrapper key of the list payload; <c>ChannelName</c> selects the category and is copied
/// onto each stored item.
/// </param>
/// <remarks>
/// Returns without doing anything when no matching category exists. Exceptions raised while
/// crawling are logged through <c>Logger.WriteException</c> instead of being rethrown.
/// </remarks>
public override void GetNewsDetail(ChannelConfig chlCfg)
{
    var url = string.Format(site.IndexUrl, chlCfg.ChannelVal);

    // Categories are looked up without the "网易" prefix, except for the "网易原创" channel,
    // which is matched by its full name.
    NewsCategory cate;
    if (chlCfg.ChannelName != "网易原创")
    {
        cate = cateList.Find(p => p.CategoryName == chlCfg.ChannelName.Replace("网易", ""));
    }
    else
    {
        cate = cateList.Find(p => p.CategoryName == chlCfg.ChannelName);
    }

    if (cate == null)
    {
        return;
    }

    try
    {
        var listJson = HttpUtility.Get(url);
        // The payload is wrapped as {"<channel>":[ ... ]}; strip the wrapper so that it
        // deserializes directly as a list.
        listJson = listJson.Replace("{\"" + chlCfg.ChannelVal + "\":[", "[").Replace("]}", "]");
        var articlList = SerilizeService<List<ArticleList>>.CreateSerilizer(Serilize_Type.Json).Deserilize(listJson);

        // Static parser setting; applying it once per call is enough.
        HtmlNode.ElementsFlags.Remove("option");

        foreach (var item in articlList)
        {
            if (string.IsNullOrEmpty(item.url))
            {
                continue;
            }

            // Skip articles that were already stored.
            if (newsItemAccess.Find(p => p.SourceUrl == item.url) != null)
            {
                continue;
            }

            var newsItem = new NewsItem()
            {
                NewsId = -1,
                SourceUrl = item.url,
                SourceSite = site.SiteId,
                Author = ""
            };
            newsItem.CategoryId = cate.CategoryId;
            newsItem.Title = item.title;
            newsItem.CreateTime = item.ptime;
            newsItem.FromSite = item.source;
            newsItem.ImgUrl = item.imgsrc ?? "";
            newsItem.ChannelName = chlCfg.ChannelName;

            // Fetch the article body, e.g. http://c.3g.163.com/nc/article/B0DQ29J400031H2L/full.html
            // Up to five attempts; each failure is logged and the request is retried.
            HtmlDocument doc = new HtmlDocument();
            for (int i = 0; i < 5; i++)
            {
                try
                {
                    var sourceUrl = string.Format(site.DetailUrl, item.docid);
                    var detailJson = HttpUtility.Get(sourceUrl);
                    // Same unwrapping as for the list: {"<docid>":{ ... }} -> { ... }
                    detailJson = detailJson.Replace("{\"" + item.docid + "\":{", "{").Replace("}}", "}");
                    var article = SerilizeService<ArticleDetail>.CreateSerilizer(Serilize_Type.Json).Deserilize(detailJson);
                    if (article.img != null)
                    {
                        foreach (var img in article.img)
                        {
                            // Each image is put in front of the body as its own paragraph.
                            // The closing tag used to be the malformed "</ p >".
                            article.body = string.Format("<p><img src=\"{0}\" itemprop=\"image\" alt=\"\" >{1}</p>", img.src, img.alt) + article.body;
                        }
                    }
                    doc.LoadHtml(article.body);
                    break;
                }
                catch (Exception ex)
                {
                    Logger.WriteException(string.Format("请求详情页失败,次数:{0} , url:{1}", i, newsItem.SourceUrl), ex);
                }
            }

            try
            {
                var div = doc.DocumentNode;
                // Nothing was loaded (all attempts failed) or the article has no text.
                if (string.IsNullOrEmpty(div.InnerText))
                {
                    continue;
                }

                // Drop comment markers that show up in the plain text.
                newsItem.NewsText = Regex.Replace(div.InnerText, @"<!--\S*-->", "");
                RemoveUnsafe(div);
                newsItem.NewsContent = div.InnerHtml;

                // Persist the news item.
                newsItemAccess.Add(newsItem);
                newsItemAccess.Save();
                SaveSegMents(newsItem);
            }
            catch (Exception ex)
            {
                Logger.WriteException("保存新闻异常", ex);
            }
        }
    }
    catch (Exception ex)
    {
        Logger.WriteException(string.Format("分类下抓取新闻出现异常:{0}", chlCfg.ChannelName), ex);
    }
}
/// <summary>
/// Downloads the topic list from <c>site.ChannelUrl</c> and inserts a channel configuration
/// for every topic that is not stored yet for this site.
/// </summary>
public override void UpdateSiteChl()
{
    // Sync the channel list, e.g. "http://c.3g.163.com/nc/topicset/ios/subscribe/manage/listspecial.html"
    var url = site.ChannelUrl;
    var str = HttpUtility.Get(url);
    // The payload is wrapped as {"tList":[ ... ]}; strip the wrapper so that it deserializes as a list.
    str = str.Replace("{\"tList\":[", "[").Replace("]}", "]");
    var topicList = SerilizeService<List<Topic>>.CreateSerilizer(Serilize_Type.Json).Deserilize(str);

    // The duplicate check must use the same site id that is written to new rows; it used
    // to be the literal 3, which only works while site.SiteId happens to be 3.
    var siteId = site.SiteId;

    foreach (var item in topicList)
    {
        // Channel names are stored with the "网易" prefix.
        item.tname = "网易" + item.tname;

        var chlCfg = new ChannelConfig();
        chlCfg.ChannelId = -1;
        chlCfg.ChannelName = item.tname;
        chlCfg.ChannelVal = item.tid;
        chlCfg.SiteId = siteId;

        var _chl = chlCfgAccess.Find(
            p => p.ChannelVal == item.tid
                && p.ChannelName == item.tname
                && p.SiteId == siteId
        );
        if (_chl != null)
        {
            // Already known; nothing to insert.
            continue;
        }

        chlCfgAccess.Add(chlCfg);
        chlCfgAccess.Save();
    }
}
/// <summary>
/// Downloads the news list of one channel, then fetches and stores every entry whose
/// source URL has not been saved yet. All items are filed under the "财经" category.
/// </summary>
/// <param name="chlCfg">
/// Channel to crawl. <c>ChannelVal</c> is formatted into <c>site.IndexUrl</c>;
/// <c>ChannelName</c> is copied onto each stored item.
/// </param>
/// <remarks>
/// Returns without doing anything when the "财经" category does not exist. Exceptions raised
/// while crawling are logged through <c>Logger.WriteException</c> instead of being rethrown.
/// </remarks>
public override void GetNewsDetail(ChannelConfig chlCfg)
{
    var url = string.Format(site.IndexUrl + "&limit=500", chlCfg.ChannelVal);
    var cate = cateList.Find(p => p.CategoryName == "财经");
    if (cate == null)
    {
        // Every item needs cate.CategoryId; without the category the first new item would
        // fail with a NullReferenceException after the list was already downloaded.
        return;
    }

    try
    {
        HttpResponse<JSON> response = new CommonService.HttpResponse<JSON>();
        var json = response.GetFuncGetResponse(url, Serilize_Type.Json);

        // Base address without the query string; the list URL ends in e.g. yw?encode=ywjh&limit=500
        var uri = url.Split('?')[0];

        // Static parser setting; applying it once per call is enough.
        HtmlNode.ElementsFlags.Remove("option");

        foreach (var item in json.news)
        {
            // Skip entries that were already stored.
            if (newsItemAccess.Find(p => p.SourceUrl == item.url_m) != null)
            {
                continue;
            }

            var newsItem = new NewsItem()
            {
                NewsId = -1,
                SourceUrl = item.url_m,
                SourceSite = site.SiteId,
                FromSite = "东方财富网",
                Author = ""
            };
            newsItem.CategoryId = cate.CategoryId;
            newsItem.Title = item.title;
            newsItem.CreateTime = item.showtime;
            newsItem.ImgUrl = item.image ?? "";
            newsItem.ChannelName = chlCfg.ChannelName;

            // Build the content endpoint from the list endpoint.
            // NOTE(review): Replace rewrites every "yw" in the base address, not only the
            // final path segment - confirm the configured URL contains it only once.
            var detailUrl = uri.Replace("yw", "content?newsid=" + item.newsid.ToString());

            // Fetch the article body; up to five attempts, each failure is logged.
            HtmlDocument doc = new HtmlDocument();
            for (int i = 0; i < 5; i++)
            {
                try
                {
                    HttpResponse<ContentJson> contentResponse = new CommonService.HttpResponse<ContentJson>();
                    var contentJson = contentResponse.GetFuncGetResponse(detailUrl, Serilize_Type.Json);
                    doc.LoadHtml(contentJson.body);
                    break;
                }
                catch (Exception ex)
                {
                    Logger.WriteException(string.Format("请求详情页失败,次数:{0} , url:{1}", i, newsItem.SourceUrl), ex);
                }
            }

            try
            {
                var div = doc.DocumentNode;
                // Nothing was loaded (all attempts failed) or the article has no text.
                if (string.IsNullOrEmpty(div.InnerText))
                {
                    continue;
                }

                // Remove the stock-image placeholder from the plain text.
                newsItem.NewsText = div.InnerText.Replace("<!-- EM_StockImg_Start --><!--IMG#0--><!-- EM_StockImg_End -->", "");
                RemoveUnsafe(div);
                newsItem.NewsContent = div.InnerHtml;

                // Persist the news item.
                newsItemAccess.Add(newsItem);
                newsItemAccess.Save();
                SaveSegMents(newsItem);
            }
            catch (Exception ex)
            {
                Logger.WriteException("保存内容异常", ex);
            }
        }
    }
    catch (Exception ex)
    {
        Logger.WriteException(string.Format("分类下抓取新闻出现异常:{0}", chlCfg.ChannelName), ex);
    }
}
/// <summary>
/// Walks the paged XML column feed of one channel, then fetches and stores every entry whose
/// source URL has not been saved yet. All items are filed under the "财经" category.
/// </summary>
/// <param name="chlCfg">
/// Channel to crawl. <c>ChannelVal</c> is formatted into <c>site.IndexUrl</c> to build the
/// address of the first page; <c>ChannelName</c> is used in the error log.
/// </param>
/// <remarks>
/// Returns without doing anything when the "财经" category does not exist. Paging follows the
/// feed's <c>nextPage</c> value until it is null or empty. Exceptions raised while crawling
/// are logged through <c>Logger.WriteException</c> instead of being rethrown.
/// </remarks>
public override void GetNewsDetail(ChannelConfig chlCfg)
{
    var url = string.Format(site.IndexUrl, chlCfg.ChannelVal);
    var cate = cateList.Find(p => p.CategoryName == "财经");
    if (cate == null)
    {
        // Every item needs cate.CategoryId; without the category the first new item would
        // fail with a NullReferenceException after the feed was already downloaded.
        return;
    }

    try
    {
        // Static parser setting; applying it once per call is enough.
        HtmlNode.ElementsFlags.Remove("option");

        while (!string.IsNullOrEmpty(url))
        {
            HttpResponse<xmlColumn> response = new CommonService.HttpResponse<xmlColumn>();
            var tonghuaSunXmlColumn = response.GetFuncGetResponse(url, Serilize_Type.Xml);

            // Advance to the next page up front; an empty value ends the loop.
            url = tonghuaSunXmlColumn.nextPage ?? "";

            foreach (var item in tonghuaSunXmlColumn.pageItems.item)
            {
                // Skip entries that were already stored.
                if (newsItemAccess.Find(p => p.SourceUrl == item.url) != null)
                {
                    continue;
                }

                var newsItem = new NewsItem()
                {
                    NewsId = -1,
                    SourceUrl = item.url,
                    SourceSite = site.SiteId,
                    Author = ""
                };
                newsItem.CategoryId = cate.CategoryId;
                newsItem.Title = item.title;

                // TryParse resets its out argument to DateTime.MinValue on failure, so the
                // "now" fallback has to be applied explicitly.
                DateTime createTime;
                if (!DateTime.TryParse(item.ctime, out createTime))
                {
                    createTime = DateTime.Now;
                }
                newsItem.CreateTime = createTime;

                newsItem.FromSite = item.source;
                newsItem.ImgUrl = item.imgurl ?? "";
                newsItem.ChannelName = tonghuaSunXmlColumn.columnName;

                // Fetch the article page; up to five attempts, each failure is logged.
                HtmlDocument doc = new HtmlDocument();
                for (int i = 0; i < 5; i++)
                {
                    try
                    {
                        doc.LoadHtml(HttpUtility.Get(newsItem.SourceUrl, Encoding.UTF8));
                        break;
                    }
                    catch (Exception ex)
                    {
                        Logger.WriteException(string.Format("请求详情页失败,次数:{0} , url:{1}", i, newsItem.SourceUrl), ex);
                    }
                }

                try
                {
                    var div = doc.DocumentNode.SelectSingleNode("//div[@id='content']");
                    // No content container (page not loaded or different layout) or no text.
                    if (div == null || string.IsNullOrEmpty(div.InnerText))
                    {
                        continue;
                    }

                    newsItem.NewsText = div.InnerText;
                    RemoveUnsafe(div);
                    newsItem.NewsContent = div.InnerHtml;

                    // Persist the news item.
                    newsItemAccess.Add(newsItem);
                    newsItemAccess.Save();
                    SaveSegMents(newsItem);
                }
                catch (Exception ex)
                {
                    Logger.WriteException("保存新闻异常", ex);
                }
            }
        }
    }
    catch (Exception ex)
    {
        Logger.WriteException(string.Format("分类下抓取新闻出现异常:{0}", chlCfg.ChannelName), ex);
    }
}
/// <summary>
/// Downloads the XML column index from <c>site.ChannelUrl</c> and inserts a channel
/// configuration for every column that is not stored yet for this site.
/// </summary>
public override void UpdateSiteChl()
{
    HttpResponse<xmlIndex> httpResponse = new HttpResponse<xmlIndex>();
    var listIndex = httpResponse.GetFuncGetResponse(site.ChannelUrl, Serilize_Type.Xml);

    foreach (var item in listIndex.item)
    {
        var chlCfg = new ChannelConfig();
        chlCfg.ChannelId = -1;
        chlCfg.ChannelName = item.name;
        chlCfg.ChannelVal = item.columnId;
        chlCfg.SiteId = site.SiteId;

        var _chl = chlCfgAccess.Find(p => p.SiteId == chlCfg.SiteId
            && p.ChannelVal == chlCfg.ChannelVal
            && p.ChannelName == chlCfg.ChannelName
        );
        if (_chl != null)
        {
            // The lookup matches on site, value and name, so an existing row already holds
            // exactly these values. The former "update" branch only rewrote this detached
            // local object and saved nothing.
            continue;
        }

        chlCfgAccess.Add(chlCfg);
        chlCfgAccess.Save();
    }
}
/// <summary>
/// Extension point for processing a single channel. This body is intentionally empty, so
/// calling it without an override does nothing.
/// </summary>
/// <param name="chlCfg">Configuration of the channel to process; unused here.</param>
/// <remarks>
/// NOTE(review): the <c>override</c> methods with this signature in this file download and
/// store a channel's articles; presumably they override this member - confirm against the
/// class hierarchy.
/// </remarks>
public virtual void GetNewsDetail(ChannelConfig chlCfg) { }