Пример #1
0
        /// <summary>
        /// Searches Baidu for Baijiahao articles matching the given keywords, opens every result
        /// and stores the article's author (identified by its Baijiahao app id) when that author
        /// is not in the database yet.
        /// </summary>
        /// <param name="keywords">
        /// Search keywords. The boilerplate words 贡献文章 / 总阅读数 / 作者文章 / 按时间 are removed
        /// before searching. A null/blank value makes the method return immediately.
        /// </param>
        /// <param name="newsType">Only forwarded to the follow-up call when paging on; not used otherwise.</param>
        /// <param name="searchPageIndex">Result page index; values above 0 are sent to Baidu as <c>pn = index * 10</c>.</param>
        /// <returns>Always 0.</returns>
        public int GatheringAuthorUrlFromSearch2(string keywords, int newsType, int searchPageIndex)
        {
            if (string.IsNullOrWhiteSpace(keywords))
            {
                return 0;
            }

            // Results on this page for which an app id could be resolved (new or already known).
            var iBjhCount = 0;
            // Authors found on this page that were not stored before.
            var iHaveValidBjh = 0;

            // Drop the boilerplate words (with and without a trailing space) from the query.
            keywords = keywords.Replace("贡献文章 ", "");
            keywords = keywords.Replace("贡献文章", "");
            keywords = keywords.Replace("总阅读数 ", "");
            keywords = keywords.Replace("总阅读数", "");
            keywords = keywords.Replace("作者文章 ", "");
            keywords = keywords.Replace("作者文章", "");
            keywords = keywords.Replace("按时间", "");

            // The cleaned keywords are stored with every author as its group id.
            // NOTE(review): the threshold is 50 characters but the cut keeps only 30 — confirm which is intended.
            var groupid = keywords;
            if (groupid.Length > 50)
            {
                groupid = groupid.Substring(0, 30);
            }

            keywords = keywords.Replace(" ", "%20");

            var inurl = "inurl%3Abaijiahao.baidu.com%20\"本文系作者授权百家号发表\"";

            // The inurl operator must be separated from the last keyword; previously this only
            // happened by accident when the cleaned keywords ended with a space.
            var separator = (keywords.Length == 0 || keywords.EndsWith("%20", StringComparison.Ordinal)) ? "" : "%20";
            var url = "https://www.baidu.com/s?wd=" + keywords + separator + inurl;

            try
            {
                if (searchPageIndex > 0)
                {
                    url += "&pn=" + searchPageIndex * 10;
                }
                Log.Info(url + " 搜索 页码" + searchPageIndex);

                var strContent = FetchBjhSearchPage(url);
                if (string.IsNullOrWhiteSpace(strContent))
                {
                    Log.Info(url + " 未抓取到任何内容 页码" + searchPageIndex);
                    Log.Error("url=" + url + " 无内容" + DateTime.Now);
                }
                else
                {
                    var lista = XpathHelper.GetOuterHtmlListByXPath(strContent, "//div[@class='f13']/a[1]");
                    if (lista != null)
                    {
                        foreach (var a in lista)
                        {
                            var href = XpathHelper.GetAttrValueByXPath(a, "//a", "href");
                            if (string.IsNullOrWhiteSpace(href))
                            {
                                // Nothing to download for this result.
                                Log.Info("取百家号主页内容没取到 href=" + href);
                                continue;
                            }

                            // Throttle requests to the article pages.
                            Thread.Sleep(1 * 1000);

                            string author;
                            string appId;
                            if (!TryResolveBjhAuthor(href, out author, out appId))
                            {
                                Log.Info("appid没取到 内容如下=== begin === href=" + href);
                                Log.Info("appid没取到 内容如下=== end === href" + href);
                                continue;
                            }

                            // Previously this counter was never incremented, so the paging
                            // branch below could never run.
                            iBjhCount++;

                            if (DalNews.IsExistsAuthor_Bjh(appId))
                            {
                                Log.Info("appid" + appId + "已存在");
                                continue;
                            }

                            iHaveValidBjh++;
                            var model = new DtoAuthor()
                            {
                                Author          = author,
                                AuthorId        = appId,
                                GroupId         = groupid,
                                IntervalMinutes = 60,
                                IsDeal          = 0,
                                IsShow          = 0,
                                LastDealTime    = DateTime.Now,
                                RefreshTimes    = 0,
                                Url             = "http://baijiahao.baidu.com/u?app_id=" + appId,
                            };
                            var id = DalNews.Insert_Author_Bjh(model);
                            Log.Info("keyword" + keywords + "authodid=" + id);
                        }
                    }
                }

                // Move on to the next result page only when this page produced at least 3 authors.
                if (iBjhCount >= 3)
                {
                    // Deep into the results with nothing new on this page: stop paging.
                    if (iHaveValidBjh < 1 && searchPageIndex > 30)
                    {
                        return 0;
                    }
                    searchPageIndex++;
                    // NOTE(review): this continues with GatheringAuthorUrlFromSearch (no "2" suffix) and the
                    // already %20-encoded keywords, exactly as originally written — confirm that is intended.
                    GatheringAuthorUrlFromSearch(keywords, newsType, searchPageIndex);
                }
            }
            catch (Exception ex)
            {
                Log.Error("url=" + url + " " + DateTime.Now);
                Log.Error(ex.Message + ex.StackTrace);
            }
            return 0;
        }

        /// <summary>
        /// Downloads a search result page: one attempt with <c>HttpHelper.GetContent</c>, then up to
        /// two more with <c>HttpHelper.GetContentByAgent</c>, pausing one second before each retry.
        /// </summary>
        /// <returns>The page content, or a null/blank string when every attempt came back empty.</returns>
        private string FetchBjhSearchPage(string url)
        {
            var content = HttpHelper.GetContent(url, Encoding.UTF8);
            for (var attempt = 0; attempt < 2 && string.IsNullOrWhiteSpace(content); attempt++)
            {
                Thread.Sleep(1 * 1000);
                content = HttpHelper.GetContentByAgent(url, Encoding.UTF8);
            }
            return content;
        }

        /// <summary>
        /// Downloads the article behind <paramref name="href"/> and extracts its author name and app id.
        /// When the first download cannot be parsed, the page is downloaded and parsed once more after a second.
        /// Failures are logged and reported as <c>false</c> instead of being swallowed silently.
        /// </summary>
        private bool TryResolveBjhAuthor(string href, out string author, out string appId)
        {
            author = "";
            appId  = "";
            try
            {
                var html = HttpHelper.GetContentByAgent(href, Encoding.UTF8);
                if (string.IsNullOrWhiteSpace(html))
                {
                    html = HttpHelper.GetContent(href, Encoding.UTF8);
                }
                if (string.IsNullOrWhiteSpace(html))
                {
                    Log.Info("取百家号主页内容没取到 href=" + href);
                    return false;
                }
                if (TryParseBjhAuthor(html, out author, out appId))
                {
                    return true;
                }

                // Second attempt: the two download methods are tried in the opposite order.
                Thread.Sleep(1 * 1000);
                html = HttpHelper.GetContent(href, Encoding.UTF8);
                if (string.IsNullOrWhiteSpace(html))
                {
                    html = HttpHelper.GetContentByAgent(href, Encoding.UTF8);
                }
                return !string.IsNullOrWhiteSpace(html) && TryParseBjhAuthor(html, out author, out appId);
            }
            catch (Exception ex)
            {
                Log.Error("href=" + href + " " + ex.Message + ex.StackTrace);
                author = "";
                appId  = "";
                return false;
            }
        }

        /// <summary>
        /// Reads the author name and app id from an article page: first from the author-detail link,
        /// then from an embedded <c>"app_id":</c> value when the link yields nothing.
        /// </summary>
        private bool TryParseBjhAuthor(string html, out string author, out string appId)
        {
            var name = XpathHelper.GetInnerHtmlByXPath(html, "//div[@class='author-detail']/a/p", "");
            author = (name ?? "").Replace("-百家号", "");

            // Expected link shape: u?app_id=1546166210605725&fr=bjhvideo&wfr=spider
            var profileHref = XpathHelper.GetAttrValueByXPath(html, "//div[@class='author-detail']/a", "href");
            appId = ExtractBjhAppIdFromHref(profileHref);
            if (string.IsNullOrWhiteSpace(appId))
            {
                appId = ExtractBjhAppIdFromJson(html);
            }
            return !string.IsNullOrWhiteSpace(appId);
        }

        /// <summary>
        /// Returns the value of the <c>app_id</c> query parameter of <paramref name="href"/>; when the
        /// parameter is missing, the value after the first '=' is used (as the previous parsing did).
        /// Returns an empty string when the link carries no value at all.
        /// </summary>
        private static string ExtractBjhAppIdFromHref(string href)
        {
            if (string.IsNullOrWhiteSpace(href))
            {
                return "";
            }

            const string marker = "app_id=";
            var start = href.IndexOf(marker, StringComparison.Ordinal);
            if (start >= 0)
            {
                start += marker.Length;
            }
            else
            {
                start = href.IndexOf('=');
                if (start < 0)
                {
                    // No '=' at all: previously this left the raw href in place as the app id.
                    return "";
                }
                start++;
            }

            // The value ends at the next parameter or fragment, whatever that parameter is called.
            var end = href.IndexOfAny(new[] { '&', '#' }, start);
            var value = end < 0 ? href.Substring(start) : href.Substring(start, end - start);
            return value.Trim();
        }

        /// <summary>
        /// Returns the value that follows the first <c>"app_id":</c> in <paramref name="html"/>
        /// (quoted or unquoted), or an empty string when the marker is absent.
        /// </summary>
        private static string ExtractBjhAppIdFromJson(string html)
        {
            const string marker = "\"app_id\":";
            var index = html.IndexOf(marker, StringComparison.Ordinal);
            if (index < 0)
            {
                return "";
            }

            var start = index + marker.Length;
            while (start < html.Length && char.IsWhiteSpace(html[start]))
            {
                start++;
            }
            if (start < html.Length && html[start] == '"')
            {
                start++;
            }

            // Read up to the closing quote / next member instead of a fixed 19-character window.
            var end = start;
            while (end < html.Length && html[end] != '"' && html[end] != ',' && html[end] != '}')
            {
                end++;
            }
            return html.Substring(start, end - start).Trim();
        }
Пример #2
0
        /// <summary>
        /// Downloads a news list page, and for every list entry whose title is not stored yet
        /// fetches the detail page and saves it: as an article (news types 100/200/300, with the
        /// images found in its content) or as a picture set (news type 400).
        /// </summary>
        /// <param name="newsListUrl">Address of the list page; it is downloaded as gb2312.</param>
        /// <param name="newsType">News type id stored with each item; also selects the article or picture branch.</param>
        /// <returns>
        /// Always <c>null</c>. NOTE(review): the declared list is never populated — confirm callers do not rely on it.
        /// </returns>
        public List<DtoNewsUrlList> NewsUrlGathering(string newsListUrl, int newsType)
        {
            try
            {
                Log.Info(newsListUrl + " 抓取开始");
                var strContent = HttpHelper.GetContentByMobileAgent(newsListUrl, Encoding.GetEncoding("gb2312"));
                if (string.IsNullOrWhiteSpace(strContent))
                {
                    Log.Info(newsListUrl + " 未抓取到任何内容");
                    return null;
                }

                // One entry per <li> of the list.
                var strList = XpathHelper.GetInnerHtmlListByXPath(strContent, "//div[@class='leftList']/ul/li");
                if (strList != null && strList.Count > 0)
                {
                    // One generator for the whole run instead of a new one per item.
                    var rnd = new Random();

                    foreach (var item in strList)
                    {
                        try
                        {
                            var url = XpathHelper.GetAttrValueByXPath(item, "//a", "href");
                            if (string.IsNullOrWhiteSpace(url))
                            {
                                // Entry without a link: nothing to fetch, so do not spend a throttle pause on it.
                                continue;
                            }

                            var title = XpathHelper.GetInnerHtmlByXPath(item, "//a", "");
                            title = StrHelper.FormatHtml(title).Trim();

                            // Items are de-duplicated by title.
                            if (DalNews.IsExistsNews(title))
                            {
                                continue;
                            }

                            if (newsType == 100 || newsType == 200 || newsType == 300)
                            {
                                // Article: fetch the detail page and store it together with its images.
                                var news = NewsGathering(url);
                                if (news != null)
                                {
                                    news.NewsTypeId = newsType;
                                    news.Title      = title;
                                    news.PubTime    = StrHelper.ToDateTime(StrHelper.FormatPubTime(news.PubTime.ToString()));

                                    var newsId = DalNews.Insert(news);
                                    if (newsId < 1)
                                    {
                                        continue;
                                    }

                                    var mediaList = ImgDeal.GetImgList(news.Contents);
                                    if (mediaList != null && mediaList.Count > 0)
                                    {
                                        // NOTE(review): assigned after the insert above, so it is not persisted here — confirm intent.
                                        news.Contents = mediaList[0].Description;

                                        foreach (var picitem in mediaList)
                                        {
                                            picitem.NewsId = newsId;
                                            DalNews.InsertMedia(picitem);
                                        }
                                    }

                                    // Random pause to limit the crawl rate.
                                    Thread.Sleep(rnd.Next(30, 90) * 1000);
                                }
                            }
                            else if (newsType == 400)
                            {
                                // Picture set: the detail page yields the media list directly.
                                var mediaList = NewsPicGathering(url);
                                if (mediaList != null && mediaList.Count > 0)
                                {
                                    var news = new DtoNews()
                                    {
                                        Title      = title,
                                        FromUrl    = url,
                                        NewsTypeId = newsType,
                                        Contents   = mediaList[0].Description,
                                    };

                                    var newsId = DalNews.Insert(news);
                                    if (newsId < 1)
                                    {
                                        // Same failure convention as the article branch: do not attach
                                        // media rows to a news record that was not created.
                                        Log.Error(url + " 入库失败");
                                    }
                                    else
                                    {
                                        foreach (var picitem in mediaList)
                                        {
                                            picitem.NewsId = newsId;
                                            DalNews.InsertMedia(picitem);
                                        }
                                    }
                                }

                                // Random pause to limit the crawl rate.
                                Thread.Sleep(rnd.Next(30, 90) * 1000);
                            }

                            Log.Info(url + " 抓取完成");
                        }
                        catch (Exception ex)
                        {
                            // A failing item must not stop the remaining ones.
                            Log.Error(ex.Message + ex.StackTrace);
                        }
                    }
                }
                Log.Info(newsListUrl + " 抓取结束");
            }
            catch (Exception ex)
            {
                Log.Error(ex.Message + ex.StackTrace);
            }
            return null;
        }