/// <summary>
/// Populates the <c>news</c> field with a fully initialised sample article
/// before a test runs.
/// </summary>
public void SetUp()
{
    // Assignments are kept in the original order so every DateTime.Now read
    // happens in the same sequence as before.
    var sample = new DtoNews();

    sample.Author = "test";
    sample.Contents = DateTime.Now + "test";
    sample.CreateTime = DateTime.Now;
    sample.FromSiteName = "testsite";
    sample.FromUrl = "http://localhost";
    sample.Title = "test1" + DateTime.Now;
    sample.LogoUrl = "http://n.sinaimg.cn/fo/transform/20160705/pBto-fxtspsa6682768.jpg";
    sample.NewsTypeId = (int)NewsTypeEnum.新闻;
    sample.PubTime = DateTime.Now;
    sample.IsShow = 1;
    sample.LogoOriginalUrl = "";
    sample.AuthorId = "";
    sample.TotalComments = 0;
    sample.Tags = "";
    sample.NewsHotClass = 7;
    sample.LastReadTimes = 0;
    sample.LastDealTime = DateTime.Now;
    sample.IsHot = 0;
    sample.IsDeal = 0;
    sample.IntervalMinutes = 60;
    sample.CurReadTimes = 0;

    news = sample;
}
/// <summary>
/// Round-trips the <c>news</c> fixture through <c>DalNews</c>:
/// insert, read back, update, existence check by title, then delete.
/// </summary>
public void InsertTest()
{
    var id = DalNews.Insert(news);
    Assert.IsTrue(id > 0);

    var deleted = false;
    try
    {
        var result = DalNews.GetNews(id);
        // Fail with an assertion (not a NullReferenceException) when the row cannot be read back.
        Assert.IsNotNull(result);
        Assert.IsTrue(result.Id == id);

        var news2 = new DtoNews()
        {
            Id = result.Id,
            CurReadTimes = 10000,
        };
        DalNews.UpdateNews(news2);

        var isHave = DalNews.IsExistsNews(result.Title);
        Assert.IsTrue(isHave);
    }
    finally
    {
        // Always remove the inserted row, even when an assertion above failed,
        // so a failing run does not leave test data behind.
        deleted = DalNews.DelNews(id);
    }

    Assert.IsTrue(deleted);
}
/// <summary>
/// Updates the read-statistics columns of one <c>T_News_Bjh</c> row, selected by <c>model.Id</c>.
/// <c>LastDealTime</c> is always written as the current time and <c>IsDeal</c> as 1;
/// the remaining values are copied from <paramref name="model"/>.
/// </summary>
/// <param name="model">Carries the row id and the new statistic values.</param>
/// <returns>
/// The result of <c>Sql.Update</c>. When an exception is caught it is logged and 1 is returned.
/// NOTE(review): the failure path therefore cannot be told apart from a result of 1 - confirm this is intended.
/// </returns>
public static int UpdateNews_Bjh(DtoNews model)
{
    try
    {
        var row = new T_News_Bjh();
        row.Id = model.Id;
        row.CurReadTimes = model.CurReadTimes;
        row.LastDealTime = DateTime.Now;
        row.LastReadTimes = model.LastReadTimes;
        row.IsHot = model.IsHot;
        row.IsDeal = 1;
        row.TotalComments = model.TotalComments;
        row.NewsHotClass = model.NewsHotClass;
        row.IntervalMinutes = model.IntervalMinutes;
        row.GroupId = model.GroupId;

        var whereClause = "Id={0}".Formats(model.Id);
        return Sql.Update(row, whereClause);
    }
    catch (Exception ex)
    {
        Log.Error(ex.Message + ex.StackTrace);
        return 1;
    }
}
/// <summary>
/// Downloads one article page (UTF-8, mobile user agent) and extracts body, publish time,
/// source name and author via XPath into a <c>DtoNews</c>.
/// </summary>
/// <param name="newsUrl">Address of the article detail page.</param>
/// <returns>
/// The populated entity with an empty <c>Title</c> and <c>IsShow</c> = 0,
/// or null when any step throws (the error is logged together with the URL).
/// </returns>
public DtoNews NewsGathering(string newsUrl)
{
    try
    {
        var pageHtml = HttpHelper.GetContentByMobileAgent(newsUrl, Encoding.UTF8);

        var body = XpathHelper.GetInnerHtmlByXPath(pageHtml, "//div[@id='artical_real']", "").Trim();

        var publishedText = StrHelper.FormatHtml(
            XpathHelper.GetInnerHtmlByXPath(pageHtml, "//div[@id='artical_sth']/p/span[1]", "")).Trim();

        var sourceName = StrHelper.FormatHtml(
            XpathHelper.GetInnerHtmlByXPath(pageHtml, "//div[@id='artical_sth']/p/span[3]/span", "")).Trim();

        var writer = StrHelper.FormatHtml(
            XpathHelper.GetInnerHtmlByXPath(pageHtml, "//div[@id='artical_sth']/p/span[4]/span", "")).Trim();

        // Use the fixed site name when the page does not expose a source.
        sourceName = string.IsNullOrWhiteSpace(sourceName) ? "凤凰佛教" : sourceName;

        var cleanedBody = DealContent(body);

        var article = new DtoNews();
        article.Contents = cleanedBody;
        article.Title = "";
        article.PubTime = StrHelper.ToDateTime(publishedText);
        article.FromUrl = newsUrl;
        article.FromSiteName = sourceName;
        article.Author = writer;
        article.CreateTime = DateTime.Now;
        article.IsShow = 0;
        return article;
    }
    catch (Exception ex)
    {
        Log.Error(newsUrl + " 错误:" + ex.Message + ex.StackTrace);
        return null;
    }
}
/// <summary>
/// Fetches one page (12 items) of an author's article list, inserts articles that are not
/// stored yet, refreshes the read statistics of known ones, and then calls itself for the
/// next page while the feed reports more items.
/// </summary>
/// <param name="url">
/// For page 0: a format string with {0} = author id and {1} = skip offset.
/// For later pages: the already formatted URL of the previous page.
/// </param>
/// <param name="authorId">Author id, used in the URL and for the existence lookup.</param>
/// <param name="groupId">Only forwarded to the recursive call; not otherwise read here.</param>
/// <param name="AuthorPageIndex">Zero-based page index.</param>
/// <returns>0 when the page could not be downloaded after three attempts, otherwise 1 (also after a logged exception).</returns>
public int DealAuthorData(string url, string authorId, string groupId, int AuthorPageIndex)
{
    const int pageSize = 12;
    var skip = AuthorPageIndex * pageSize;
    url = BuildBjhPageUrl(url, authorId, skip);

    var strContent = "";
    try
    {
        Log.Info(url + " 百家号抓取开始 页码" + AuthorPageIndex);

        strContent = FetchBjhPageWithRetry(url);
        if (string.IsNullOrWhiteSpace(strContent))
        {
            Log.Info(url + " 未抓取到任何内容 页码" + AuthorPageIndex);
            return(0);
        }

        var data = JsonConvert.DeserializeObject<DtoBaijiahaoAuthorJsData>(strContent);
        if (data == null)
        {
            Log.Info(url + " 百家号未取到数据 页码" + AuthorPageIndex);
            return(1);
        }

        Log.Info(url + " 页码" + AuthorPageIndex);

        var isHaveMore = false;
        if (data.items != null && data.items.Count > 0)
        {
            isHaveMore = data.total > (AuthorPageIndex + 1) * pageSize;

            foreach (var subItem in data.items)
            {
                try
                {
                    var pubTime = Comm.Tools.Utility.StringConverter.ToDateTime(subItem.publish_at);

                    // Articles published more than one month ago are not stored.
                    if (pubTime.AddMonths(1) < DateTime.Now)
                    {
                        Log.Info("发布时间在1月前不入库 appid=" + subItem.app_id + " pubtime=" + subItem.publish_at + " title=" + subItem.title);
                        continue;
                    }

                    var newsId = DalNews.IsExistsNews_Bjh(authorId, subItem.title);
                    if (newsId < 1)
                    {
                        // Not stored yet: insert with default statistics (class 7, 60-minute interval).
                        var model = new DtoNews()
                        {
                            Author = "",
                            AuthorId = authorId,
                            Contents = "",
                            CreateTime = DateTime.Now,
                            CurReadTimes = subItem.read_amount,
                            FromSiteName = "baijiahao",
                            FromUrl = subItem.url,
                            IntervalMinutes = 60,
                            IsDeal = 0,
                            IsHot = 0,
                            IsShow = 1,
                            LastDealTime = DateTime.Now,
                            LastReadTimes = subItem.read_amount,
                            LogoOriginalUrl = subItem.url,
                            LogoUrl = "",
                            NewsHotClass = 7,
                            NewsTypeId = (int)NewsTypeEnum.新闻,
                            PubTime = pubTime,
                            Tags = subItem.tag,
                            Title = subItem.title,
                            TotalComments = subItem.comment_amount,
                            RefreshTimes = 0,
                            GroupId = subItem.app_id
                        };
                        DalNews.Insert_News_Bjh(model);
                    }
                    else
                    {
                        // Already stored: recompute popularity from the reads gained since the last pass.
                        var oldNews = DalNews.GetNews_Bjh(newsId);
                        if (oldNews != null)
                        {
                            // Refresh policy (from the original notes): articles published today are refreshed
                            // every 15 minutes (hourly for classes 5-7), older ones daily.
                            // Popularity class by reads gained per 15 minutes:
                            // >10000 = 1, >5000 = 2, >2500 = 3, >1000 = 4, >500 = 5, >100 = 6, otherwise 7.
                            var isHot = 0;
                            var minutes = (DateTime.Now - oldNews.LastDealTime).TotalMinutes;
                            var newsClassId = 7;
                            var addReads = subItem.read_amount - oldNews.CurReadTimes;
                            var intervalMinutes = 24 * 60;

                            if (addReads > 0)
                            {
                                if (minutes > 60)
                                {
                                    var perHourReads = addReads / (minutes / 60.0);
                                    if (perHourReads > 10000)
                                    {
                                        isHot = 1;
                                    }
                                }
                                else if (addReads > 10000)
                                {
                                    isHot = 1;
                                }

                                // NOTE(review): when minutes is very small (or 0) this rate is inflated
                                // (or infinite) and lands in class 1 - confirm whether a lower bound is wanted.
                                var per15MinutesReads = addReads / (minutes / 15.0);
                                if (per15MinutesReads > 10000)
                                {
                                    newsClassId = 1;
                                    isHot = 1;
                                    intervalMinutes = 15;
                                }
                                else if (per15MinutesReads > 5000)
                                {
                                    newsClassId = 2;
                                    isHot = 1;
                                    intervalMinutes = 15;
                                }
                                else if (per15MinutesReads > 2500)
                                {
                                    newsClassId = 3;
                                    intervalMinutes = 15;
                                }
                                else if (per15MinutesReads > 1000)
                                {
                                    newsClassId = 4;
                                    intervalMinutes = 15;
                                }
                                else if (per15MinutesReads > 500)
                                {
                                    newsClassId = 5;
                                    intervalMinutes = 60;
                                }
                                else if (per15MinutesReads > 100)
                                {
                                    newsClassId = 6;
                                    intervalMinutes = 60;
                                }
                                else
                                {
                                    newsClassId = 7;
                                    intervalMinutes = 60;
                                }
                            }

                            // Published more than 24 hours ago: refresh once a day.
                            if (oldNews.PubTime.AddHours(24) < DateTime.Now)
                            {
                                intervalMinutes = 24 * 60;
                            }

                            // An article that was already hot stays hot.
                            if (oldNews.IsHot == 1)
                            {
                                isHot = 1;
                            }

                            // Keep the numerically lowest class the article has ever had.
                            if (oldNews.NewsHotClass < newsClassId)
                            {
                                newsClassId = oldNews.NewsHotClass;
                            }

                            var model = new DtoNews()
                            {
                                Id = newsId,
                                LastReadTimes = oldNews.CurReadTimes,
                                CurReadTimes = subItem.read_amount,
                                IsHot = isHot,
                                IsDeal = 1,
                                TotalComments = subItem.comment_amount,
                                IntervalMinutes = intervalMinutes,
                                NewsHotClass = newsClassId,
                                LastDealTime = DateTime.Now,
                            };
                            DalNews.UpdateNews_Bjh(model);
                        }
                    }
                }
                catch (Exception ex)
                {
                    // One bad item must not stop the page, but the failure should be visible in the log.
                    Log.Error(ex.Message + ex.StackTrace);
                }
            }
        }

        if (isHaveMore)
        {
            // More items reported by the feed: short pause, then process the next page.
            Thread.Sleep(200);
            AuthorPageIndex++;
            DealAuthorData(url, authorId, groupId, AuthorPageIndex);
        }
        else
        {
            Log.Info("本百家号抓取结束总页数" + AuthorPageIndex);
            Thread.Sleep(200);
        }
    }
    catch (Exception ex)
    {
        Log.Error(ex.Message + ex.StackTrace);
        Log.Debug("======strContent begin 百家号抓取=========");
        Log.Debug(url);
        Log.Debug(strContent);
        Log.Debug("======strContent end =========");
    }
    return(1);
}

/// <summary>
/// Builds the URL for the page starting at <paramref name="skip"/>.
/// Page 0 formats the template; later pages replace only the value of the <c>_skip</c> query
/// parameter, instead of every occurrence of the previous offset anywhere in the URL.
/// </summary>
private static string BuildBjhPageUrl(string url, string authorId, int skip)
{
    if (skip == 0)
    {
        return string.Format(url, authorId, skip);
    }

    // presumably the offset parameter is named "_skip" (as in the sample URL) - verify against caller
    return System.Text.RegularExpressions.Regex.Replace(url, @"([?&]_skip=)\d+", "${1}" + skip);
}

/// <summary>
/// Downloads the page, trying up to three times while the response is blank.
/// </summary>
private string FetchBjhPageWithRetry(string url)
{
    const int maxAttempts = 3;
    var content = "";
    for (var attempt = 0; attempt < maxAttempts; attempt++)
    {
        content = HttpHelper.GetContentByAgent(url, Encoding.UTF8);
        if (!string.IsNullOrWhiteSpace(content))
        {
            break;
        }
    }
    return content;
}
/// <summary>
/// Downloads a news list page (gb2312, mobile user agent), and for every listed entry whose
/// title is not stored yet fetches the detail page and stores the article (types 100/200/300)
/// or the picture set (type 400) together with its media rows.
/// </summary>
/// <param name="newsListUrl">Address of the list page.</param>
/// <param name="newsType">News type id; only 100, 200, 300 and 400 trigger a detail fetch.</param>
/// <returns>Always null; results are written through <c>DalNews</c>.</returns>
public List <DtoNewsUrlList> NewsUrlGathering(string newsListUrl, int newsType)
{
    try
    {
        Log.Info(newsListUrl + " 抓取开始");
        var strContent = HttpHelper.GetContentByMobileAgent(newsListUrl, Encoding.GetEncoding("gb2312"));
        if (string.IsNullOrWhiteSpace(strContent))
        {
            Log.Info(newsListUrl + " 未抓取到任何内容");
            return(null);
        }

        // One entry per list item.
        var strList = XpathHelper.GetInnerHtmlListByXPath(strContent, "//div[@class='leftList']/ul/li");
        if (strList != null && strList.Count > 0)
        {
            foreach (var item in strList)
            {
                try
                {
                    var url = XpathHelper.GetAttrValueByXPath(item, "//a", "href");
                    var title = XpathHelper.GetInnerHtmlByXPath(item, "//a", "");
                    title = StrHelper.FormatHtml(title).Trim();

                    // Skip entries whose title is already stored.
                    if (DalNews.IsExistsNews(title))
                    {
                        continue;
                    }

                    if (newsType == 100 || newsType == 200 || newsType == 300)
                    {
                        // Article: fetch the detail page and store it.
                        var news = NewsGathering(url);
                        if (news != null)
                        {
                            news.NewsTypeId = newsType;
                            news.Title = title;
                            news.PubTime = StrHelper.ToDateTime(StrHelper.FormatPubTime(news.PubTime.ToString()));

                            var newsId = DalNews.Insert(news);
                            if (newsId < 1)
                            {
                                continue;
                            }

                            // Extract the images from the content and store them as media rows.
                            var mediaList = ImgDeal.GetImgList(news.Contents);
                            if (mediaList != null && mediaList.Count > 0)
                            {
                                // NOTE(review): assigned after the insert and not read again in this method.
                                news.Contents = mediaList[0].Description;
                                foreach (var picitem in mediaList)
                                {
                                    picitem.NewsId = newsId;
                                    DalNews.InsertMedia(picitem);
                                }
                            }

                            SleepBetweenFetches();
                        }
                    }

                    if (newsType == 400)
                    {
                        // Picture set: fetch the pictures and store them under a new news row.
                        var mediaList = NewsPicGathering(url);
                        var news = new DtoNews()
                        {
                            Title = title,
                            FromUrl = url,
                            NewsTypeId = newsType,
                        };
                        if (mediaList != null && mediaList.Count > 0)
                        {
                            news.Contents = mediaList[0].Description;

                            var newsId = DalNews.Insert(news);
                            // Same rule as the article branch: no media rows when the news insert failed,
                            // otherwise they would reference an invalid news id.
                            if (newsId >= 1)
                            {
                                foreach (var picitem in mediaList)
                                {
                                    picitem.NewsId = newsId;
                                    DalNews.InsertMedia(picitem);
                                }
                            }
                        }

                        SleepBetweenFetches();
                    }

                    Log.Info(url + " 抓取完成");
                }
                catch (Exception ex)
                {
                    Log.Error(ex.Message + ex.StackTrace);
                }
            }
        }

        Log.Info(newsListUrl + " 抓取结束");
        return(null);
    }
    catch (Exception ex)
    {
        Log.Error(ex.Message + ex.StackTrace);
    }
    return(null);
}

/// <summary>
/// Pauses for a random 30 to 89 seconds to limit the crawl frequency.
/// </summary>
private static void SleepBetweenFetches()
{
    Random rnd = new Random();
    var sleepSeconds = rnd.Next(30, 90);
    Thread.Sleep(sleepSeconds * 1000);
}
/// <summary>
/// Downloads one article page (gb2312) and extracts body, publish time, author and the first
/// image address via XPath into a <c>DtoNews</c>. The source name is fixed.
/// </summary>
/// <param name="newsUrl">Address of the article detail page.</param>
/// <returns>
/// The populated entity with an empty <c>Title</c> and <c>IsShow</c> = 1,
/// or null when any step throws (the error is logged together with the URL).
/// </returns>
public DtoNews NewsGathering(string newsUrl)
{
    try
    {
        var pageHtml = HttpHelper.GetContent(newsUrl, Encoding.GetEncoding("gb2312"));

        var body = XpathHelper.GetInnerHtmlByXPath(pageHtml, "//div[@id='Cnt-Main-Article-QQ']", "");
        body = body.Trim();

        var publishedText = StrHelper.FormatHtml(
            XpathHelper.GetInnerHtmlByXPath(pageHtml, "//span[@class='article-time']", "")).Trim();

        // The first image inside the article body is used as the logo; empty when there is none.
        var imageUrls = XpathHelper.GetAttrValueListByXPath(body, "//img", "src");
        var firstImage = (imageUrls != null && imageUrls.Count > 0) ? imageUrls[0] : "";

        var writer = StrHelper.FormatHtml(
            XpathHelper.GetInnerHtmlByXPath(pageHtml, "//div[@id='C-Main-Article-QQ']/div[1]/div/div[1]/span[5]", "")).Trim();

        var cleanedBody = DealContent(body);

        var article = new DtoNews();
        article.Contents = cleanedBody;
        article.Title = "";
        article.PubTime = StrHelper.ToDateTime(publishedText);
        article.FromUrl = newsUrl;
        article.FromSiteName = "腾讯佛学";
        article.Author = writer;
        article.CreateTime = DateTime.Now;
        article.IsShow = 1;
        article.LogoOriginalUrl = firstImage;
        article.LogoUrl = firstImage;
        return article;
    }
    catch (Exception ex)
    {
        Log.Error(newsUrl + " 错误:" + ex.Message + ex.StackTrace);
        return null;
    }
}
/// <summary>
/// Adds one news record.
/// </summary>
/// <param name="model">
/// News entity to store. Null <c>Author</c>, <c>Contents</c>, <c>FromSiteName</c>, <c>FromUrl</c>,
/// <c>LogoOriginalUrl</c>, <c>LogoUrl</c> and <c>Title</c> are replaced with "" and an unset
/// <c>PubTime</c> with the current time; these replacements are written to the passed instance.
/// </param>
/// <returns>The id returned by <c>Sql.InsertId</c>, or -1 when an exception was caught and logged.</returns>
public static int Insert(DtoNews model)
{
    try
    {
        if (model.Author == null)
        {
            model.Author = "";
        }
        if (model.Contents == null)
        {
            model.Contents = "";
        }
        if (model.FromSiteName == null)
        {
            model.FromSiteName = "";
        }
        // A non-nullable DateTime is never null, so an unset value arrives as default(DateTime);
        // treat that as "not set" too, otherwise the fallback to the current time never applies.
        if (model.PubTime == null || model.PubTime == default(DateTime))
        {
            model.PubTime = DateTime.Now;
        }
        if (model.FromUrl == null)
        {
            model.FromUrl = "";
        }
        if (model.LogoOriginalUrl == null)
        {
            model.LogoOriginalUrl = "";
        }
        if (model.LogoUrl == null)
        {
            model.LogoUrl = "";
        }
        if (model.Title == null)
        {
            model.Title = "";
        }

        var item = new T_News()
        {
            Author = model.Author,
            Contents = model.Contents,
            FromSiteName = model.FromSiteName,
            FromUrl = model.FromUrl,
            // IsShow is always written as 0; model.IsShow is not used.
            IsShow = 0,
            LogoOriginalUrl = model.LogoOriginalUrl,
            LogoUrl = model.LogoUrl,
            NewsTypeId = (int)model.NewsTypeId,
            PubTime = model.PubTime,
            Title = model.Title,
            AuthorId = model.AuthorId,
            TotalComments = model.TotalComments,
            Tags = model.Tags,
            NewsHotClass = model.NewsHotClass,
            LastReadTimes = model.LastReadTimes,
            // LastDealTime and CreateTime are stamped here; the values on model are not used.
            LastDealTime = DateTime.Now,
            IsHot = model.IsHot,
            IsDeal = model.IsDeal,
            IntervalMinutes = model.IntervalMinutes,
            CurReadTimes = model.CurReadTimes,
            CreateTime = DateTime.Now,
            GroupId = model.GroupId,
        };

        var id = Sql.InsertId <T_News>(item);
        return(id);
    }
    catch (Exception ex)
    {
        Log.Error(ex.Message + ex.StackTrace);
        return(-1);
    }
}