/// <summary>
/// Downloads a picture-gallery page and builds one <see cref="DtoNewsMedia"/> per image
/// found in the JSON array that the page embeds in a script containing "G_listdata".
/// </summary>
/// <remarks>
/// The image data is read from the embedded script rather than from the rendered HTML,
/// because the required fields (title, image URL, original image URL) live in the script.
/// NOTE(review): the closing bracket is the first ']' after the opening '[', so a payload
/// containing a nested array or a ']' inside a string would be truncated — confirm the
/// page format never does that.
/// </remarks>
/// <param name="newsUrl">Address of the gallery page to download.</param>
/// <returns>
/// The media entries in page order (possibly empty when the array has no items), or
/// <c>null</c> when the page could not be downloaded, the image data could not be
/// located, or any exception occurred (the exception is logged, not rethrown).
/// </returns>
public List<DtoNewsMedia> NewsPicGatheringOne(string newsUrl)
{
    try
    {
        var list = new List<DtoNewsMedia>();

        var strNewContent = HttpHelper.GetContentByMobileAgent(newsUrl, Encoding.UTF8);
        if (string.IsNullOrWhiteSpace(strNewContent))
        {
            Log.Info(newsUrl + " 未抓取到任何内容");
            return null;
        }

        // Pick the first <script> under <body> that mentions the image data variable.
        var strScriptList = XpathHelper.GetInnerHtmlListByXPath(strNewContent, "/html/body/script");
        var strScript = "";
        if (strScriptList != null)
        {
            foreach (var str in strScriptList)
            {
                if (str != null && str.Contains("G_listdata"))
                {
                    strScript = str;
                    break;
                }
            }
        }

        // Locate the JSON array that follows the marker. Searching from the marker
        // (instead of from the start of the script) avoids picking up an unrelated '['
        // that appears earlier, and searching ']' from '[' avoids a negative length.
        var markerIndex = strScript.IndexOf("G_listdata", StringComparison.Ordinal);
        var startIndex = markerIndex < 0 ? -1 : strScript.IndexOf('[', markerIndex);
        var endIndex = startIndex < 0 ? -1 : strScript.IndexOf(']', startIndex);
        if (endIndex < 0)
        {
            Log.Error(newsUrl + " 错误:未找到 G_listdata 图片数据");
            return null;
        }

        var strJson = strScript.Substring(startIndex, endIndex - startIndex + 1);
        var imgList = JsonConvert.DeserializeObject<List<DtoIfengImg>>(strJson);

        if (imgList != null && imgList.Count > 0)
        {
            var order = 1;
            foreach (var img in imgList)
            {
                // A JSON "null" entry deserializes to a null element; skip it rather
                // than losing the whole gallery to a NullReferenceException.
                if (img == null)
                {
                    continue;
                }

                var model = new DtoNewsMedia
                {
                    // A missing title must not abort the whole gallery.
                    Description = img.title == null ? null : img.title.Trim(),
                    Orders = order,
                    PicUrl = img.img,
                    PicOriginalUrl = img.originalimg,
                    IsShow = 1,
                    // Id is temporarily used to carry the total number of images.
                    Id = imgList.Count
                };
                order++;
                list.Add(model);
            }
        }

        return list;
    }
    catch (Exception ex)
    {
        Log.Error(newsUrl + " 错误:" + ex.Message + ex.StackTrace);
    }

    return null;
}
/// <summary>
/// Downloads a news list page, and for every list entry whose title is not already
/// stored, fetches the detail page and persists the news record and its media rows.
/// </summary>
/// <remarks>
/// News types 100, 200 and 300 are handled as articles (via <c>NewsGathering</c>);
/// type 400 is handled as a picture set (via <c>NewsPicGathering</c>). Other types are
/// only checked for existence and logged. After every detail-page fetch the thread
/// sleeps for a random interval to limit the crawl rate, so this call can block for a
/// long time.
/// </remarks>
/// <param name="newsListUrl">Address of the list page; decoded as gb2312.</param>
/// <param name="newsType">News type id stored on each created record and used to pick the handler.</param>
/// <returns>
/// Always <c>null</c>: results are persisted through <c>DalNews</c> rather than returned.
/// Exceptions are logged and never propagate to the caller.
/// </returns>
public List<DtoNewsUrlList> NewsUrlGathering(string newsListUrl, int newsType)
{
    try
    {
        Log.Info(newsListUrl + " 抓取开始");

        var strContent = HttpHelper.GetContentByMobileAgent(newsListUrl, Encoding.GetEncoding("gb2312"));
        if (string.IsNullOrWhiteSpace(strContent))
        {
            Log.Info(newsListUrl + " 未抓取到任何内容");
            return null;
        }

        // Each <li> of the left-hand list holds one headline link.
        var strList = XpathHelper.GetInnerHtmlListByXPath(strContent, "//div[@class='leftList']/ul/li");
        if (strList != null && strList.Count > 0)
        {
            // One generator for the whole run instead of one per list item.
            var rnd = new Random();

            foreach (var item in strList)
            {
                // Declared outside the try so the catch block can report which item failed.
                string url = null;
                try
                {
                    url = XpathHelper.GetAttrValueByXPath(item, "//a", "href");
                    var title = XpathHelper.GetInnerHtmlByXPath(item, "//a", "");
                    title = StrHelper.FormatHtml(title).Trim();

                    // Without a link there is nothing to fetch.
                    if (string.IsNullOrWhiteSpace(url))
                    {
                        Log.Info(newsListUrl + " 列表项缺少链接,已跳过");
                        continue;
                    }

                    // Skip headlines that are already stored.
                    if (DalNews.IsExistsNews(title))
                    {
                        continue;
                    }

                    var fetched = false;
                    var saved = true;

                    if (newsType == 100 || newsType == 200 || newsType == 300)
                    {
                        saved = SaveArticle(url, title, newsType);
                        fetched = true;
                    }

                    if (newsType == 400)
                    {
                        saved = SavePictureSet(url, title, newsType);
                        fetched = true;
                    }

                    // Throttle after every detail request, including ones whose result
                    // could not be stored, so a run of failures never hammers the site.
                    if (fetched)
                    {
                        PauseBetweenRequests(rnd);
                    }

                    if (!saved)
                    {
                        Log.Error(url + " 错误:入库失败");
                        continue;
                    }

                    Log.Info(url + " 抓取完成");
                }
                catch (Exception ex)
                {
                    // One bad item must not stop the rest of the list.
                    Log.Error((url ?? newsListUrl) + " 错误:" + ex.Message + ex.StackTrace);
                }
            }
        }

        Log.Info(newsListUrl + " 抓取结束");
        return null;
    }
    catch (Exception ex)
    {
        Log.Error(newsListUrl + " 错误:" + ex.Message + ex.StackTrace);
    }

    return null;
}

/// <summary>
/// Fetches an article detail page, stores the news record, then stores the images
/// extracted from its content as media rows.
/// </summary>
/// <returns><c>false</c> only when the news record insert returned an id below 1.</returns>
private bool SaveArticle(string url, string title, int newsType)
{
    var news = NewsGathering(url);
    if (news == null)
    {
        // Nothing was gathered; not treated as a storage failure.
        return true;
    }

    news.NewsTypeId = newsType;
    news.Title = title;
    news.PubTime = StrHelper.ToDateTime(StrHelper.FormatPubTime(news.PubTime.ToString()));

    var newsId = DalNews.Insert(news);
    if (newsId < 1)
    {
        return false;
    }

    // Extract <img> entries from the content and store them as media of this news.
    var mediaList = ImgDeal.GetImgList(news.Contents);
    if (mediaList != null && mediaList.Count > 0)
    {
        // NOTE(review): assigned after the insert, so it only changes the in-memory
        // object — kept as in the original; confirm whether it is still needed.
        news.Contents = mediaList[0].Description;
        foreach (var picitem in mediaList)
        {
            picitem.NewsId = newsId;
            DalNews.InsertMedia(picitem);
        }
    }

    return true;
}

/// <summary>
/// Fetches a picture-set detail page and, when it yields at least one image, stores a
/// news record (using the first image description as content) plus one media row per image.
/// </summary>
/// <returns><c>false</c> only when the news record insert returned an id below 1.</returns>
private bool SavePictureSet(string url, string title, int newsType)
{
    var mediaList = NewsPicGathering(url);
    if (mediaList == null || mediaList.Count == 0)
    {
        return true;
    }

    var news = new DtoNews()
    {
        Title = title,
        FromUrl = url,
        NewsTypeId = newsType,
        Contents = mediaList[0].Description
    };

    var newsId = DalNews.Insert(news);
    if (newsId < 1)
    {
        // Do not write media rows that would point at a record that was never created.
        return false;
    }

    foreach (var picitem in mediaList)
    {
        picitem.NewsId = newsId;
        DalNews.InsertMedia(picitem);
    }

    return true;
}

/// <summary>
/// Blocks the current thread for a random whole number of seconds in [30, 90)
/// to control the crawl rate.
/// </summary>
private static void PauseBetweenRequests(Random rnd)
{
    var sleepSeconds = rnd.Next(30, 90);
    Thread.Sleep(sleepSeconds * 1000);
}