/// <summary>
/// Searches Baidu for Baijiahao articles matching the keywords, opens every result link,
/// reads the author's name and app_id from the article page, and inserts the authors that
/// are not stored yet.
/// </summary>
/// <param name="keywords">
/// Search text. The phrases "贡献文章", "总阅读数", "作者文章" and "按时间" are removed and
/// spaces are replaced with "%20" before the text is placed in the query string.
/// </param>
/// <param name="newsType">Not used by this method itself; only forwarded when the next result page is requested.</param>
/// <param name="searchPageIndex">Result page index; values above 0 add "&amp;pn=" with index * 10 to the search URL.</param>
/// <returns>Always 0.</returns>
public int GatheringAuthorUrlFromSearch2(string keywords, int newsType, int searchPageIndex)
{
    if (string.IsNullOrWhiteSpace(keywords))
    {
        return 0;
    }

    // Kept untouched so the next page is requested with exactly what the caller passed in;
    // the variable below is rewritten into a URL fragment.
    var originalKeywords = keywords;

    // Results on this page that resolved to an app_id; decides whether the next page is read.
    var iBjhCount = 0;
    // Authors from this page that were not stored yet and were inserted.
    var iHaveValidBjh = 0;

    // Remove the boilerplate phrases (with and without a trailing space).
    keywords = keywords.Replace("贡献文章 ", "");
    keywords = keywords.Replace("贡献文章", "");
    keywords = keywords.Replace("总阅读数 ", "");
    keywords = keywords.Replace("总阅读数", "");
    keywords = keywords.Replace("作者文章 ", "");
    keywords = keywords.Replace("作者文章", "");
    keywords = keywords.Replace("按时间", "");

    // The cleaned keywords are stored with each author as the group id.
    // NOTE(review): text longer than 50 characters is cut to 30 while 31-50 characters are
    // kept whole - confirm which limit is intended.
    var groupid = keywords;
    if (groupid.Length > 50)
    {
        groupid = groupid.Substring(0, 30);
    }

    // NOTE(review): apart from spaces the keywords are not URL-escaped - confirm that
    // HttpHelper copes with characters such as '&' or '#'.
    keywords = keywords.Replace(" ", "%20");

    var inurl = "inurl%3Abaijiahao.baidu.com%20\"本文系作者授权百家号发表\"";
    // Without a separator the last keyword and the "inurl" operator fuse into one token.
    var separator = keywords.EndsWith("%20", StringComparison.Ordinal) ? "" : "%20";
    var url = "https://www.baidu.com/s?wd=" + keywords + separator + inurl;

    try
    {
        if (searchPageIndex > 0)
        {
            url += "&pn=" + searchPageIndex * 10;
        }
        Log.Info(url + " 搜索 页码" + searchPageIndex);

        var strContent = FetchBjhSearchPage(url, searchPageIndex);
        if (string.IsNullOrWhiteSpace(strContent))
        {
            Log.Error("url=" + url + " 无内容" + DateTime.Now);
            return 0;
        }

        var lista = XpathHelper.GetOuterHtmlListByXPath(strContent, "//div[@class='f13']/a[1]");
        if (lista != null && lista.Count > 0)
        {
            foreach (var a in lista)
            {
                var href = XpathHelper.GetAttrValueByXPath(a, "//a", "href");

                // Pause before each article request.
                Thread.Sleep(1 * 1000);
                var str = HttpHelper.GetContentByAgent(href, Encoding.UTF8);
                if (string.IsNullOrWhiteSpace(str))
                {
                    str = HttpHelper.GetContent(href, Encoding.UTF8);
                }

                var author = "";
                var appId = "";
                if (!string.IsNullOrWhiteSpace(str))
                {
                    try
                    {
                        author = ExtractBjhAuthorName(str);
                        appId = ExtractBjhAppId(str);

                        if (string.IsNullOrWhiteSpace(appId))
                        {
                            // The page carried no app_id: download it once more, trying the
                            // two fetch methods in the opposite order, and parse again.
                            Thread.Sleep(1 * 1000);
                            str = HttpHelper.GetContent(href, Encoding.UTF8);
                            if (string.IsNullOrWhiteSpace(str))
                            {
                                str = HttpHelper.GetContentByAgent(href, Encoding.UTF8);
                            }
                            if (!string.IsNullOrWhiteSpace(str))
                            {
                                author = ExtractBjhAuthorName(str);
                                appId = ExtractBjhAppId(str);
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        // A page that cannot be parsed must not abort the whole result list,
                        // but the failure should be visible in the log.
                        Log.Error("href=" + href + " " + ex.Message + ex.StackTrace);
                    }
                }
                else
                {
                    Log.Info("取百家号主页内容没取到 href=" + href);
                }

                if (string.IsNullOrWhiteSpace(appId))
                {
                    Log.Info("appid没取到 内容如下=== begin === href=" + href);
                    Log.Info("appid没取到 内容如下=== end === href" + href);
                    continue;
                }

                // This result is a Baijiahao page with a usable app_id.
                iBjhCount++;

                var isHave = DalNews.IsExistsAuthor_Bjh(appId);
                if (!isHave)
                {
                    iHaveValidBjh++;
                    var model = new DtoAuthor()
                    {
                        Author = author,
                        AuthorId = appId,
                        GroupId = groupid,
                        IntervalMinutes = 60,
                        IsDeal = 0,
                        IsShow = 0,
                        LastDealTime = DateTime.Now,
                        RefreshTimes = 0,
                        Url = "http://baijiahao.baidu.com/u?app_id=" + appId,
                    };
                    var id = DalNews.Insert_Author_Bjh(model);
                    Log.Info("keyword" + keywords + "authodid=" + id);
                }
                else
                {
                    Log.Info("appid" + appId + "已存在");
                }
            }
        }

        // Read the next page only when this page produced at least 3 Baijiahao results.
        if (iBjhCount >= 3)
        {
            // Deep into the result list with nothing new on this page: stop paging.
            if (iHaveValidBjh < 1 && searchPageIndex > 30)
            {
                return 0;
            }
            // The "pn" offset only makes sense for the same query, so the next page is
            // requested through this method, with the caller's original keywords.
            GatheringAuthorUrlFromSearch2(originalKeywords, newsType, searchPageIndex + 1);
        }
    }
    catch (Exception ex)
    {
        Log.Error("url=" + url + " " + DateTime.Now);
        Log.Error(ex.Message + ex.StackTrace);
    }
    return 0;
}

// Downloads a search result page: one plain request, then up to two retries through
// GetContentByAgent with a one second pause before each retry.
private string FetchBjhSearchPage(string url, int searchPageIndex)
{
    var content = HttpHelper.GetContent(url, Encoding.UTF8);
    if (!string.IsNullOrWhiteSpace(content))
    {
        return content;
    }

    for (var attempt = 0; attempt < 2; attempt++)
    {
        Thread.Sleep(1 * 1000);
        content = HttpHelper.GetContentByAgent(url, Encoding.UTF8);
        if (!string.IsNullOrWhiteSpace(content))
        {
            return content;
        }
    }

    Log.Info(url + " 未抓取到任何内容 页码" + searchPageIndex);
    return content;
}

// Reads the author name from the "author-detail" block and drops the "-百家号" suffix.
private string ExtractBjhAuthorName(string html)
{
    var name = XpathHelper.GetInnerHtmlByXPath(html, "//div[@class='author-detail']/a/p", "");
    return name == null ? "" : name.Replace("-百家号", "");
}

// Reads the app_id from the author link (e.g. u?app_id=1546166210605725&fr=bjhvideo&wfr=spider)
// or, failing that, from the text following the "app_id": marker. Returns "" when neither exists.
private string ExtractBjhAppId(string html)
{
    var link = XpathHelper.GetAttrValueByXPath(html, "//div[@class='author-detail']/a", "href");
    if (!string.IsNullOrWhiteSpace(link))
    {
        // parts[1] is the text between the first and second '=', e.g. "1546166210605725&fr".
        var parts = link.Split('=');
        if (parts.Length > 1)
        {
            return parts[1].Replace("&fr", "");
        }
    }

    var markerIndex = html.IndexOf("\"app_id\":", StringComparison.Ordinal);
    if (markerIndex < 0)
    {
        return "";
    }

    // Skip the 9-character marker plus one more character, then take at most 19 characters
    // and strip the quotes, commas and "type" text that may trail the id.
    var start = markerIndex + 10;
    if (start >= html.Length)
    {
        return "";
    }
    var length = Math.Min(19, html.Length - start);
    return html.Substring(start, length)
        .Replace("\",\"type", "")
        .Replace("\"", "")
        .Replace(",", "")
        .Replace("type", "");
}
/// <summary>
/// Downloads a news list page and, for every list entry whose title is not stored yet,
/// fetches the detail page and saves the news item together with its media rows.
/// </summary>
/// <param name="newsListUrl">Address of the list page; fetched through HttpHelper.GetContentByMobileAgent and decoded as gb2312.</param>
/// <param name="newsType">
/// 100, 200 or 300 take the article branch (NewsGathering); 400 takes the picture branch
/// (NewsPicGathering); for any other value an entry is only checked for existence and logged.
/// </param>
/// <returns>Always null; the results are written through DalNews.</returns>
public List <DtoNewsUrlList> NewsUrlGathering(string newsListUrl, int newsType)
{
    try
    {
        Log.Info(newsListUrl + " 抓取开始");
        var strContent = HttpHelper.GetContentByMobileAgent(newsListUrl, Encoding.GetEncoding("gb2312"));
        if (string.IsNullOrWhiteSpace(strContent))
        {
            Log.Info(newsListUrl + " 未抓取到任何内容");
            return null;
        }

        // One <li> per list entry.
        var strList = XpathHelper.GetInnerHtmlListByXPath(strContent, "//div[@class='leftList']/ul/li");
        if (strList != null && strList.Count > 0)
        {
            // One generator for the whole list instead of a new one per entry.
            var rnd = new Random();

            foreach (var item in strList)
            {
                try
                {
                    var url = XpathHelper.GetAttrValueByXPath(item, "//a", "href");
                    var title = XpathHelper.GetInnerHtmlByXPath(item, "//a", "");
                    title = StrHelper.FormatHtml(title).Trim();

                    // Entries whose title is already stored are skipped.
                    if (DalNews.IsExistsNews(title))
                    {
                        continue;
                    }

                    if (newsType == 100 || newsType == 200 || newsType == 300)
                    {
                        // Article branch: the detail page yields the news item itself.
                        var news = NewsGathering(url);
                        if (news != null)
                        {
                            news.NewsTypeId = newsType;
                            news.Title = title;
                            news.PubTime = StrHelper.ToDateTime(StrHelper.FormatPubTime(news.PubTime.ToString()));

                            var newsId = DalNews.Insert(news);
                            if (newsId < 1)
                            {
                                continue;
                            }

                            // Images found in the content are stored as media rows of the new item.
                            var mediaList = ImgDeal.GetImgList(news.Contents);
                            if (mediaList != null && mediaList.Count > 0)
                            {
                                // NOTE(review): assigned after Insert and not read again in this
                                // method - confirm whether it was meant to be saved.
                                news.Contents = mediaList[0].Description;
                                foreach (var picitem in mediaList)
                                {
                                    picitem.NewsId = newsId;
                                    DalNews.InsertMedia(picitem);
                                }
                            }

                            // Pause 30-89 seconds to limit the crawl rate.
                            Thread.Sleep(rnd.Next(30, 90) * 1000);
                        }
                    }

                    if (newsType == 400)
                    {
                        // Picture branch: the detail page yields the media list, and the news
                        // item is built from the list entry.
                        var mediaList = NewsPicGathering(url);
                        var news = new DtoNews()
                        {
                            Title = title,
                            FromUrl = url,
                            NewsTypeId = newsType,
                        };
                        if (mediaList != null && mediaList.Count > 0)
                        {
                            news.Contents = mediaList[0].Description;
                            var newsId = DalNews.Insert(news);

                            // Same rule as the article branch: no media rows are attached
                            // when Insert returns a value below 1.
                            if (newsId >= 1)
                            {
                                foreach (var picitem in mediaList)
                                {
                                    picitem.NewsId = newsId;
                                    DalNews.InsertMedia(picitem);
                                }
                            }
                        }

                        // Pause 30-89 seconds to limit the crawl rate.
                        Thread.Sleep(rnd.Next(30, 90) * 1000);
                    }

                    Log.Info(url + " 抓取完成");
                }
                catch (Exception ex)
                {
                    // One failing entry must not stop the rest of the list.
                    Log.Error(ex.Message + ex.StackTrace);
                }
            }
        }

        Log.Info(newsListUrl + " 抓取结束");
        return null;
    }
    catch (Exception ex)
    {
        Log.Error(ex.Message + ex.StackTrace);
    }
    return null;
}