/// <summary>
/// Crawls the article list (read counts etc.) of every pending Baijiahao author.
/// </summary>
/// <remarks>
/// The pending authors come from <c>DalNews.GetNoDealAuthorList_Bjh()</c>; according to the
/// original author's note that call also flags the rows as "in progress" (isdeal=2) — not
/// verifiable from this method. All exceptions are logged and swallowed.
/// </remarks>
/// <returns>Always 0.</returns>
public int GatheringNewsFromAuthor()
{
    // {0} = app_id of the author, {1} = number of articles to skip (filled in by DealAuthorData).
    const string listApiTemplate = "http://baijiahao.baidu.com/api/content/article/listall?sk=super&ak=super&app_id={0}&_skip={1}&_limit=12";

    try
    {
        var pendingAuthors = DalNews.GetNoDealAuthorList_Bjh();

        // Nothing queued: log it and back off for a minute before the caller polls again.
        if (pendingAuthors == null || pendingAuthors.Count <= 0)
        {
            Log.Info("暂时没有要处理的百家号url");
            Thread.Sleep(60 * 1000);
            return 0;
        }

        foreach (var author in pendingAuthors)
        {
            // Rows without an author id cannot be crawled.
            if (string.IsNullOrWhiteSpace(author.AuthorId))
            {
                continue;
            }

            DealAuthorData(listApiTemplate, author.AuthorId, author.GroupId, 0);
        }

        // Short pause after a full batch.
        Thread.Sleep(200);
    }
    catch (Exception failure)
    {
        Log.Error(failure.Message + failure.StackTrace);
    }

    return 0;
}
/// <summary>
/// Runs a title search (<c>intitle:"keyword"</c>) for every pending keyword returned by
/// <c>DalNews.GetNoDealKeyword()</c>, pausing two seconds between searches.
/// </summary>
/// <remarks>
/// A null keyword list is treated like an empty one, matching the null check done on the
/// DAL result in <c>GatheringNewsFromAuthor</c>. Blank keywords are skipped because a search
/// for <c>intitle:""</c> cannot yield anything useful.
/// </remarks>
/// <returns>Always 0.</returns>
public int GatheringAuthorUrlSearch()
{
    var keywords = DalNews.GetNoDealKeyword();
    if (keywords == null)
    {
        return 0;
    }

    foreach (var keyword in keywords)
    {
        if (string.IsNullOrWhiteSpace(keyword.Keyword))
        {
            continue;
        }

        // %3A is ':' and %20 is a space; the keyword itself is passed through unencoded, as before.
        GatheringAuthorUrlFromSearch("intitle%3A\"" + keyword.Keyword + "\"%20", 100, 0);

        // Throttle consecutive searches.
        Thread.Sleep(2 * 1000);
    }

    return 0;
}
/// <summary>
/// Crawls the article-list API of one Baijiahao author page by page (12 articles per page),
/// inserting unknown articles and refreshing read counts / hotness of known ones.
/// </summary>
/// <param name="url">
/// Either the URL template containing <c>{0}</c> (app id) and <c>{1}</c> (skip) placeholders,
/// or an already formatted URL whose <c>_skip=</c> value is rewritten for each page.
/// </param>
/// <param name="authorId">Baijiahao app id of the author.</param>
/// <param name="groupId">Not used by this method; kept for signature compatibility.</param>
/// <param name="AuthorPageIndex">Zero-based index of the first page to fetch.</param>
/// <returns>
/// 0 when the first requested page returned no content after three attempts; otherwise 1.
/// </returns>
public int DealAuthorData(string url, string authorId, string groupId, int AuthorPageIndex)
{
    const int pageSize = 12;
    var firstPageIndex = AuthorPageIndex;

    // Iterates over pages instead of recursing; each page builds its URL from the original
    // input so nothing but the skip value ever changes.
    while (true)
    {
        var pageUrl = BuildBjhAuthorPageUrl(url, authorId, AuthorPageIndex * pageSize);
        var strContent = "";
        var isHaveMore = false;

        try
        {
            Log.Info(pageUrl + " 百家号抓取开始 页码" + AuthorPageIndex);

            strContent = FetchBjhAuthorPage(pageUrl);
            if (string.IsNullOrWhiteSpace(strContent))
            {
                Log.Info(pageUrl + " 未抓取到任何内容 页码" + AuthorPageIndex);
                // Only a failure on the very first page is reported as 0 to the caller.
                return AuthorPageIndex == firstPageIndex ? 0 : 1;
            }

            var data = JsonConvert.DeserializeObject<DtoBaijiahaoAuthorJsData>(strContent);
            if (data == null)
            {
                Log.Info(pageUrl + " 百家号未取到数据 页码" + AuthorPageIndex);
                return 1;
            }

            Log.Info(pageUrl + " 页码" + AuthorPageIndex);

            if (data.items != null && data.items.Count > 0)
            {
                isHaveMore = data.total > (AuthorPageIndex + 1) * pageSize;

                foreach (var subItem in data.items)
                {
                    try
                    {
                        var pubTime = Comm.Tools.Utility.StringConverter.ToDateTime(subItem.publish_at);

                        // Articles published more than one month ago are not stored.
                        if (pubTime.AddMonths(1) < DateTime.Now)
                        {
                            Log.Info("发布时间在1月前不入库 appid=" + subItem.app_id + " pubtime=" + subItem.publish_at + " title=" + subItem.title);
                            continue;
                        }

                        var newsId = DalNews.IsExistsNews_Bjh(authorId, subItem.title);
                        if (newsId < 1)
                        {
                            // Unknown article: insert it with the lowest hotness class (7).
                            var model = new DtoNews()
                            {
                                Author = "",
                                AuthorId = authorId,
                                Contents = "",
                                CreateTime = DateTime.Now,
                                CurReadTimes = subItem.read_amount,
                                FromSiteName = "baijiahao",
                                FromUrl = subItem.url,
                                IntervalMinutes = 60,
                                IsDeal = 0,
                                IsHot = 0,
                                IsShow = 1,
                                LastDealTime = DateTime.Now,
                                LastReadTimes = subItem.read_amount,
                                LogoOriginalUrl = subItem.url,
                                LogoUrl = "",
                                NewsHotClass = 7,
                                NewsTypeId = (int)NewsTypeEnum.新闻,
                                PubTime = pubTime,
                                Tags = subItem.tag,
                                Title = subItem.title,
                                TotalComments = subItem.comment_amount,
                                RefreshTimes = 0,
                                GroupId = subItem.app_id
                            };
                            DalNews.Insert_News_Bjh(model);
                            continue;
                        }

                        // Known article: refresh counters and re-classify by read growth.
                        var oldNews = DalNews.GetNews_Bjh(newsId);
                        if (oldNews == null)
                        {
                            continue;
                        }

                        // Hotness classes by reads gained per 15 minutes:
                        // >10000 => 1, >5000 => 2, >2500 => 3, >1000 => 4, >500 => 5, >100 => 6, else 7.
                        var isHot = 0;
                        var minutes = (DateTime.Now - oldNews.LastDealTime).TotalMinutes;
                        var newsClassId = 7;
                        var addReads = subItem.read_amount - oldNews.CurReadTimes;
                        var intervalMinutes = 24 * 60;

                        if (addReads > 0)
                        {
                            if (minutes > 60)
                            {
                                var perHourReads = addReads / (minutes / 60.0);
                                if (perHourReads > 10000)
                                {
                                    isHot = 1;
                                }
                            }
                            else if (addReads > 10000)
                            {
                                isHot = 1;
                            }

                            // Never extrapolate from less than one 15-minute window: with a tiny
                            // (or zero) elapsed time the division would yield a huge or infinite
                            // rate and permanently flag the article as hot. This mirrors the
                            // hourly branch above, which uses the raw increase for short spans.
                            var rateWindowMinutes = Math.Max(minutes, 15.0);
                            var per15MinutesReads = addReads / (rateWindowMinutes / 15.0);

                            if (per15MinutesReads > 10000)
                            {
                                newsClassId = 1;
                                isHot = 1;
                                intervalMinutes = 15;
                            }
                            else if (per15MinutesReads > 5000)
                            {
                                newsClassId = 2;
                                isHot = 1;
                                intervalMinutes = 15;
                            }
                            else if (per15MinutesReads > 2500)
                            {
                                newsClassId = 3;
                                intervalMinutes = 15;
                            }
                            else if (per15MinutesReads > 1000)
                            {
                                newsClassId = 4;
                                intervalMinutes = 15;
                            }
                            else if (per15MinutesReads > 500)
                            {
                                newsClassId = 5;
                                intervalMinutes = 60;
                            }
                            else if (per15MinutesReads > 100)
                            {
                                newsClassId = 6;
                                intervalMinutes = 60;
                            }
                            else
                            {
                                newsClassId = 7;
                                intervalMinutes = 60;
                            }
                        }

                        // Articles older than 24 hours are refreshed once a day.
                        if (oldNews.PubTime.AddHours(24) < DateTime.Now)
                        {
                            intervalMinutes = 24 * 60;
                        }

                        // Once hot, always hot.
                        if (oldNews.IsHot == 1)
                        {
                            isHot = 1;
                        }

                        // The hotness class never gets worse (a lower number is hotter).
                        if (oldNews.NewsHotClass < newsClassId)
                        {
                            newsClassId = oldNews.NewsHotClass;
                        }

                        var update = new DtoNews()
                        {
                            Id = newsId,
                            LastReadTimes = oldNews.CurReadTimes,
                            CurReadTimes = subItem.read_amount,
                            IsHot = isHot,
                            IsDeal = 1,
                            TotalComments = subItem.comment_amount,
                            IntervalMinutes = intervalMinutes,
                            NewsHotClass = newsClassId,
                            LastDealTime = DateTime.Now,
                        };
                        DalNews.UpdateNews_Bjh(update);
                    }
                    catch (Exception ex)
                    {
                        // One bad article must not abort the page, but the failure is no longer silent.
                        Log.Error("百家号文章处理异常 url=" + pageUrl + " " + ex.Message + ex.StackTrace);
                    }
                }
            }

            if (!isHaveMore)
            {
                Log.Info("本百家号抓取结束总页数" + AuthorPageIndex);
                Thread.Sleep(200);
                return 1;
            }

            // More pages available: short pause, then fetch the next one.
            Thread.Sleep(200);
            AuthorPageIndex++;
        }
        catch (Exception ex)
        {
            Log.Error(ex.Message + ex.StackTrace);
            Log.Debug("======strContent begin 百家号抓取=========");
            Log.Debug(pageUrl);
            Log.Debug(strContent);
            Log.Debug("======strContent end =========");
            return 1;
        }
    }
}

/// <summary>
/// Builds the list URL for one page: fills the {0}/{1} placeholders of a template, or, for an
/// already formatted URL, replaces only the value of its <c>_skip=</c> query parameter.
/// </summary>
private static string BuildBjhAuthorPageUrl(string url, string authorId, int skip)
{
    if (url.Contains("{0}") || url.Contains("{1}"))
    {
        return string.Format(url, authorId, skip);
    }

    const string skipKey = "_skip=";
    var valueStart = url.IndexOf(skipKey, StringComparison.Ordinal);
    if (valueStart < 0)
    {
        return url;
    }

    valueStart += skipKey.Length;
    var valueEnd = url.IndexOf('&', valueStart);
    if (valueEnd < 0)
    {
        valueEnd = url.Length;
    }

    return url.Substring(0, valueStart) + skip + url.Substring(valueEnd);
}

/// <summary>
/// Requests the page up to three times and returns the first non-blank response
/// (or the last blank one). Retries exist because proxied requests fail frequently.
/// </summary>
private string FetchBjhAuthorPage(string pageUrl)
{
    const int maxAttempts = 3;
    var content = "";
    for (var attempt = 0; attempt < maxAttempts; attempt++)
    {
        content = HttpHelper.GetContentByAgent(pageUrl, Encoding.UTF8);
        if (!string.IsNullOrWhiteSpace(content))
        {
            break;
        }
    }

    return content;
}
/// <summary>
/// Searches Baidu for Baijiahao articles matching the keywords, opens each result to find the
/// article's author (name and app id) and stores authors that are not yet known.
/// </summary>
/// <remarks>
/// Paging: the next result page is fetched while the current page produced at least three
/// resolvable Baijiahao authors; beyond page 30 paging stops as soon as a page yields no new
/// author. The paging is done inside this method so every page uses the same query.
/// </remarks>
/// <param name="keywords">Search keywords; several marker phrases are stripped and spaces become %20.</param>
/// <param name="newsType">Not used by this method; kept for signature compatibility.</param>
/// <param name="searchPageIndex">Zero-based index of the first result page (10 results per page).</param>
/// <returns>Always 0.</returns>
public int GatheringAuthorUrlFromSearch2(string keywords, int newsType, int searchPageIndex)
{
    if (string.IsNullOrWhiteSpace(keywords))
    {
        return 0;
    }

    // Strip the marker phrases (with and without trailing space) from the query.
    keywords = keywords.Replace("贡献文章 ", "");
    keywords = keywords.Replace("贡献文章", "");
    keywords = keywords.Replace("总阅读数 ", "");
    keywords = keywords.Replace("总阅读数", "");
    keywords = keywords.Replace("作者文章 ", "");
    keywords = keywords.Replace("作者文章", "");
    keywords = keywords.Replace("按时间", "");

    // The cleaned keywords are recorded with each author as its group id.
    var groupid = keywords;
    if (groupid.Length > 50)
    {
        groupid = groupid.Substring(0, 30);
    }

    keywords = keywords.Replace(" ", "%20");
    var inurl = "inurl%3Abaijiahao.baidu.com%20\"本文系作者授权百家号发表\"";
    var baseUrl = "https://www.baidu.com/s?wd=" + keywords + inurl;

    while (true)
    {
        var url = baseUrl;
        try
        {
            if (searchPageIndex > 0)
            {
                url += "&pn=" + searchPageIndex * 10;
            }

            Log.Info(url + " 搜索 页码" + searchPageIndex);

            // Number of results on this page that resolved to a Baijiahao author.
            var iBjhCount = 0;
            // Number of those authors that were new and have been inserted.
            var iHaveValidBjh = 0;

            var strContent = FetchBjhSearchPage(url, searchPageIndex);
            if (!string.IsNullOrWhiteSpace(strContent))
            {
                var lista = XpathHelper.GetOuterHtmlListByXPath(strContent, "//div[@class='f13']/a[1]");
                if (lista != null && lista.Count > 0)
                {
                    foreach (var a in lista)
                    {
                        var href = XpathHelper.GetAttrValueByXPath(a, "//a", "href");

                        // Throttle article requests.
                        Thread.Sleep(1 * 1000);

                        string author;
                        string appId;
                        ResolveBjhAuthorFromArticle(href, out author, out appId);

                        if (string.IsNullOrWhiteSpace(appId))
                        {
                            Log.Info("appid没取到 内容如下=== begin === href=" + href);
                            Log.Info("appid没取到 内容如下=== end === href" + href);
                            continue;
                        }

                        // This counter drives paging below; it was never incremented before,
                        // so the paging branch could not be reached.
                        iBjhCount++;

                        var isHave = DalNews.IsExistsAuthor_Bjh(appId);
                        if (!isHave)
                        {
                            iHaveValidBjh++;
                            var model = new DtoAuthor()
                            {
                                Author = author,
                                AuthorId = appId,
                                GroupId = groupid,
                                IntervalMinutes = 60,
                                IsDeal = 0,
                                IsShow = 0,
                                LastDealTime = DateTime.Now,
                                RefreshTimes = 0,
                                Url = "http://baijiahao.baidu.com/u?app_id=" + appId,
                            };
                            var id = DalNews.Insert_Author_Bjh(model);
                            Log.Info("keyword" + keywords + "authodid=" + id);
                        }
                        else
                        {
                            Log.Info("appid" + appId + "已存在");
                        }
                    }
                }
            }
            else
            {
                Log.Error("url=" + url + " 无内容" + DateTime.Now);
            }

            // Fewer than three authors on this page: stop paging.
            if (iBjhCount < 3)
            {
                return 0;
            }

            // Deep into the results and nothing new on this page: stop paging.
            if (iHaveValidBjh < 1 && searchPageIndex > 30)
            {
                return 0;
            }

            searchPageIndex++;
        }
        catch (Exception ex)
        {
            Log.Error("url=" + url + " " + DateTime.Now);
            Log.Error(ex.Message + ex.StackTrace);
            return 0;
        }
    }
}

/// <summary>
/// Fetches one search result page: a direct request first, then up to two requests through
/// <c>GetContentByAgent</c> with a one second pause before each. Returns blank when all fail.
/// </summary>
private string FetchBjhSearchPage(string url, int searchPageIndex)
{
    var content = HttpHelper.GetContent(url, Encoding.UTF8);
    for (var retry = 0; retry < 2 && string.IsNullOrWhiteSpace(content); retry++)
    {
        Thread.Sleep(1 * 1000);
        content = HttpHelper.GetContentByAgent(url, Encoding.UTF8);
    }

    if (string.IsNullOrWhiteSpace(content))
    {
        Log.Info(url + " 未抓取到任何内容 页码" + searchPageIndex);
    }

    return content;
}

/// <summary>
/// Downloads the article at <paramref name="href"/> and extracts its author's name and app id.
/// Both outputs are empty strings when the page cannot be fetched or parsed.
/// </summary>
private void ResolveBjhAuthorFromArticle(string href, out string author, out string appId)
{
    author = "";
    appId = "";

    var html = HttpHelper.GetContentByAgent(href, Encoding.UTF8);
    if (string.IsNullOrWhiteSpace(html))
    {
        html = HttpHelper.GetContent(href, Encoding.UTF8);
    }

    if (string.IsNullOrWhiteSpace(html))
    {
        Log.Info("取百家号主页内容没取到 href=" + href);
        return;
    }

    try
    {
        if (TryExtractBjhAuthor(html, out author, out appId))
        {
            return;
        }

        // Nothing found: fetch the article once more (direct request first this time) and retry.
        Thread.Sleep(1 * 1000);
        html = HttpHelper.GetContent(href, Encoding.UTF8);
        if (string.IsNullOrWhiteSpace(html))
        {
            html = HttpHelper.GetContentByAgent(href, Encoding.UTF8);
        }

        if (!string.IsNullOrWhiteSpace(html))
        {
            TryExtractBjhAuthor(html, out author, out appId);
        }
    }
    catch (Exception ex)
    {
        // Never hand a half-parsed value back to the caller.
        author = "";
        appId = "";
        Log.Error("解析百家号作者信息异常 href=" + href + " " + ex.Message + ex.StackTrace);
    }
}

/// <summary>
/// Reads the author name and app id from an article page: the app id comes from the
/// author-detail link when present, otherwise from the embedded <c>"app_id":</c> JSON field.
/// </summary>
/// <returns>True when a non-blank app id was found.</returns>
private bool TryExtractBjhAuthor(string html, out string author, out string appId)
{
    author = (XpathHelper.GetInnerHtmlByXPath(html, "//div[@class='author-detail']/a/p", "") ?? "").Replace("-百家号", "");

    // Expected shape: u?app_id=1546166210605725&fr=bjhvideo&wfr=spider
    var authorHref = XpathHelper.GetAttrValueByXPath(html, "//div[@class='author-detail']/a", "href");

    appId = string.IsNullOrWhiteSpace(authorHref) ? "" : ParseBjhAppIdFromHref(authorHref);
    if (string.IsNullOrWhiteSpace(appId))
    {
        appId = ParseBjhAppIdFromJson(html);
    }

    return !string.IsNullOrWhiteSpace(appId);
}

/// <summary>
/// Returns the value of the <c>app_id</c> query parameter (or, when absent, of the first
/// parameter) up to the next '&amp;'; empty when the link carries no parameter at all.
/// </summary>
private static string ParseBjhAppIdFromHref(string href)
{
    const string key = "app_id=";
    var start = href.IndexOf(key, StringComparison.Ordinal);
    if (start >= 0)
    {
        start += key.Length;
    }
    else
    {
        start = href.IndexOf('=');
        if (start < 0)
        {
            return "";
        }

        start++;
    }

    var end = href.IndexOf('&', start);
    return end < 0 ? href.Substring(start) : href.Substring(start, end - start);
}

/// <summary>
/// Extracts the app id from the <c>"app_id":"…"</c> fragment of the page source by taking up to
/// 19 characters after the opening quote and stripping the JSON punctuation that follows the id.
/// </summary>
private static string ParseBjhAppIdFromJson(string html)
{
    var markerIndex = html.IndexOf("\"app_id\":", StringComparison.Ordinal);
    if (markerIndex <= 0)
    {
        return "";
    }

    // 9 characters of marker plus the opening quote of the value.
    var start = markerIndex + 10;
    if (start >= html.Length)
    {
        return "";
    }

    // Clamp the length so a marker near the end of the page cannot throw.
    var length = Math.Min(19, html.Length - start);
    return html.Substring(start, length)
        .Replace("\",\"type", "")
        .Replace("\"", "")
        .Replace(",", "")
        .Replace("type", "");
}
/// <summary>
/// Downloads the logo image and the gallery images of every news item returned by
/// <c>DalNews.GetNoDealNewsList()</c> into a per-news folder, sleeping 30–89 seconds
/// after each item to limit the request rate.
/// </summary>
/// <remarks>
/// Writing the saved paths back to the database is currently disabled (the update calls are
/// commented out in the original), so only the files on disk are produced.
/// </remarks>
public void ImgDealing()
{
    try
    {
        Log.Info("图片处理开始");

        var pendingNews = DalNews.GetNoDealNewsList();
        if (pendingNews == null || pendingNews.Count < 1)
        {
            Log.Info("没有要处理的图片" + DateTime.Now);
            return;
        }

        foreach (var item in pendingNews)
        {
            try
            {
                Log.Info("===== newsid= " + item.Id + " 处理开始 =====");

                var newsId = item.Id;
                // Target folder: <prefix><suffix><news id>\
                var targetFolder = Global.ImgSavePrex + Global.ImgSaveSuffix + newsId + "\\";

                // --- logo image ---
                Log.Info("处理logo图:" + item.LogoUrl);
                var logoSource = item.LogoUrl;
                var logoFile = Img.NetImgSaveAs(logoSource, targetFolder);
                if (!string.IsNullOrWhiteSpace(logoFile))
                {
                    // Path that would be stored in the database; the update itself is disabled.
                    var storedLogoPath = Global.ImgSaveSuffix + newsId + "\\" + logoFile;
                    var pendingUpdate = new T_News()
                    {
                        LogoUrl = storedLogoPath,
                        Id = newsId
                    };
                }

                // Images inside the article content are deliberately not processed.

                // --- gallery images of picture-type news ---
                if (item.NewsMedia != null && item.NewsMedia.Count > 0)
                {
                    foreach (var media in item.NewsMedia)
                    {
                        var mediaSource = media.PicUrl;
                        var mediaFile = Img.NetImgSaveAs(mediaSource, targetFolder);
                        if (string.IsNullOrWhiteSpace(mediaFile))
                        {
                            continue;
                        }

                        // Path that would be stored in the database; the update itself is disabled.
                        var storedMediaPath = Global.ImgSaveSuffix + newsId + "\\" + mediaFile;
                    }
                }

                Log.Info("===== newsid= " + item.Id + " 处理完成 =====");

                // Random pause to control the crawl frequency.
                Random pauseSource = new Random();
                var pauseSeconds = pauseSource.Next(30, 90);
                Thread.Sleep(pauseSeconds * 1000);
            }
            catch (Exception itemFailure)
            {
                Log.Error("内容: " + item);
                Log.Error(itemFailure.Message + itemFailure.StackTrace);
            }
        }

        Log.Info("图片处理结束");
    }
    catch (Exception failure)
    {
        Log.Error(failure.Message + failure.StackTrace);
    }
}
/// <summary>
/// Fetches a news list page (gb2312, mobile user agent), and for every list entry whose title
/// is not stored yet crawls the detail page and saves the news plus its images.
/// </summary>
/// <remarks>
/// News types 100, 200 and 300 are crawled as articles via <c>NewsGathering</c>; type 400 is
/// crawled as a picture gallery via <c>NewsPicGathering</c>. After each crawled detail page the
/// method sleeps 30–89 seconds. Errors are logged and swallowed.
/// </remarks>
/// <param name="newsListUrl">URL of the list page.</param>
/// <param name="newsType">News type id; decides how the detail page is crawled.</param>
/// <returns>Always null — callers must not rely on the returned list.</returns>
public List <DtoNewsUrlList> NewsUrlGathering(string newsListUrl, int newsType)
{
    try
    {
        Log.Info(newsListUrl + " 抓取开始");

        var strContent = HttpHelper.GetContentByMobileAgent(newsListUrl, Encoding.GetEncoding("gb2312"));
        if (string.IsNullOrWhiteSpace(strContent))
        {
            Log.Info(newsListUrl + " 未抓取到任何内容");
            return(null);
        }

        // One Random for the whole run instead of a new instance per entry.
        var rnd = new Random();

        // Each <li> of the left list holds one headline link.
        var strList = XpathHelper.GetInnerHtmlListByXPath(strContent, "//div[@class='leftList']/ul/li");
        if (strList != null && strList.Count > 0)
        {
            foreach (var item in strList)
            {
                try
                {
                    var url = XpathHelper.GetAttrValueByXPath(item, "//a", "href");
                    var title = XpathHelper.GetInnerHtmlByXPath(item, "//a", "");
                    title = StrHelper.FormatHtml(title).Trim();

                    // Skip headlines that are already stored.
                    var isHave = DalNews.IsExistsNews(title);
                    if (isHave)
                    {
                        continue;
                    }

                    if (newsType == 100 || newsType == 200 || newsType == 300)
                    {
                        // Article: crawl the detail page and store it.
                        var news = NewsGathering(url);
                        if (news != null)
                        {
                            news.NewsTypeId = newsType;
                            news.Title = title;
                            news.PubTime = StrHelper.ToDateTime(StrHelper.FormatPubTime(news.PubTime.ToString()));

                            var newsId = DalNews.Insert(news);
                            if (newsId < 1)
                            {
                                continue;
                            }

                            // Extract the images of the content and store them as media rows.
                            var mediaList = ImgDeal.GetImgList(news.Contents);
                            if (mediaList != null && mediaList.Count > 0)
                            {
                                // NOTE(review): assigned after the insert, so this value is not persisted here.
                                news.Contents = mediaList[0].Description;
                                foreach (var picitem in mediaList)
                                {
                                    picitem.NewsId = newsId;
                                    DalNews.InsertMedia(picitem);
                                }
                            }

                            // Random pause to control the crawl frequency.
                            Thread.Sleep(rnd.Next(30, 90) * 1000);
                        }
                    }

                    if (newsType == 400)
                    {
                        // Picture gallery: crawl the pictures and store them under a new news row.
                        var mediaList = NewsPicGathering(url);
                        var news = new DtoNews()
                        {
                            Title = title,
                            FromUrl = url,
                            NewsTypeId = newsType,
                        };

                        if (mediaList != null && mediaList.Count > 0)
                        {
                            news.Contents = mediaList[0].Description;

                            var newsId = DalNews.Insert(news);
                            if (newsId < 1)
                            {
                                // Same rule as the article branch: without a valid news id the
                                // media rows would be orphaned, so they are not inserted.
                                Log.Info(url + " 图片新闻入库失败,跳过图片入库");
                            }
                            else
                            {
                                foreach (var picitem in mediaList)
                                {
                                    picitem.NewsId = newsId;
                                    DalNews.InsertMedia(picitem);
                                }
                            }
                        }

                        // Random pause to control the crawl frequency.
                        Thread.Sleep(rnd.Next(30, 90) * 1000);
                    }

                    Log.Info(url + " 抓取完成");
                }
                catch (Exception ex)
                {
                    Log.Error(ex.Message + ex.StackTrace);
                }
            }
        }

        Log.Info(newsListUrl + " 抓取结束");
        return(null);
    }
    catch (Exception ex)
    {
        Log.Error(ex.Message + ex.StackTrace);
    }

    return(null);
}