コード例 #1
0
        // Builds the shared `news` fixture: a visible article of type NewsTypeEnum.新闻 with
        // placeholder text fields, hot class 7, a 60 minute refresh interval and zeroed counters.
        public void SetUp()
        {
            var fixture = new DtoNews();

            // Text and origin fields. Contents and Title embed the current time.
            fixture.Author       = "test";
            fixture.Contents     = DateTime.Now + "test";
            fixture.CreateTime   = DateTime.Now;
            fixture.FromSiteName = "testsite";
            fixture.FromUrl      = "http://localhost";
            fixture.Title        = "test1" + DateTime.Now;
            fixture.LogoUrl      = "http://n.sinaimg.cn/fo/transform/20160705/pBto-fxtspsa6682768.jpg";
            fixture.NewsTypeId   = (int)NewsTypeEnum.新闻;
            fixture.PubTime      = DateTime.Now;
            fixture.IsShow       = 1;

            fixture.LogoOriginalUrl = "";
            fixture.AuthorId        = "";

            // Statistics and scheduling fields.
            fixture.TotalComments   = 0;
            fixture.Tags            = "";
            fixture.NewsHotClass    = 7;
            fixture.LastReadTimes   = 0;
            fixture.LastDealTime    = DateTime.Now;
            fixture.IsHot           = 0;
            fixture.IsDeal          = 0;
            fixture.IntervalMinutes = 60;
            fixture.CurReadTimes    = 0;

            // Publish the fixture only once it is fully populated.
            news = fixture;
        }
コード例 #2
0
        // Round-trip test against DalNews: insert the fixture, read it back, update it,
        // look it up by title and finally delete it. The inserted row is removed even when
        // an assertion in between fails, so a failing run does not leave test data behind.
        public void InsertTest()
        {
            var id = DalNews.Insert(news);

            Assert.IsTrue(id > 0);

            // Becomes true as soon as the regular delete call has returned.
            var deleteAttempted = false;
            try
            {
                var result = DalNews.GetNews(id);

                // Report a missing row as an assertion failure instead of a NullReferenceException.
                Assert.IsTrue(result != null);
                Assert.IsTrue(result.Id == id);

                var news2 = new DtoNews()
                {
                    Id           = result.Id,
                    CurReadTimes = 10000,
                };

                DalNews.UpdateNews(news2);

                var isHave = DalNews.IsExistsNews(result.Title);

                Assert.IsTrue(isHave);

                var result2 = DalNews.DelNews(id);
                deleteAttempted = true;

                Assert.IsTrue(result2);
            }
            finally
            {
                // Best-effort cleanup when the test stopped before reaching the delete step.
                if (!deleteAttempted)
                {
                    DalNews.DelNews(id);
                }
            }
        }
コード例 #3
0
ファイル: DalNews.cs プロジェクト: thachgiasoft/Lfb.DataGrab
 /// <summary>
 /// Updates the statistics fields (read counters, comment count, hot flag, hot class,
 /// refresh interval, group id) of the T_News_Bjh record whose Id equals the model's Id.
 /// IsDeal is always written as 1 and LastDealTime as the current time.
 /// </summary>
 /// <param name="model">Carries the Id of the record to update and the new field values.</param>
 /// <returns>
 /// The value returned by <c>Sql.Update</c> (presumably the affected row count - confirm),
 /// or 0 when the update threw; the exception is logged, not rethrown.
 /// </returns>
 public static int UpdateNews_Bjh(DtoNews model)
 {
     try
     {
         var news = new T_News_Bjh()
         {
             Id              = model.Id,
             CurReadTimes    = model.CurReadTimes,
             LastDealTime    = DateTime.Now,
             LastReadTimes   = model.LastReadTimes,
             IsHot           = model.IsHot,
             IsDeal          = 1,
             TotalComments   = model.TotalComments,
             NewsHotClass    = model.NewsHotClass,
             IntervalMinutes = model.IntervalMinutes,
             GroupId         = model.GroupId,
         };
         return Sql.Update(news, "Id={0}".Formats(model.Id));
     }
     catch (Exception ex)
     {
         Log.Error(ex.Message + ex.StackTrace);

         // A failed update changed nothing; do not report it as a successful result.
         return 0;
     }
 }
コード例 #4
0
        /// <summary>
        /// Downloads an article detail page with the mobile user agent (UTF-8) and extracts
        /// its body, publish time, source name and author via XPath.
        /// </summary>
        /// <param name="newsUrl">Address of the article detail page.</param>
        /// <returns>
        /// A <c>DtoNews</c> with an empty Title and IsShow = 0, or null when any step throws
        /// (the error is logged together with the URL).
        /// </returns>
        public DtoNews NewsGathering(string newsUrl)
        {
            DtoNews gathered = null;

            try
            {
                var pageHtml = HttpHelper.GetContentByMobileAgent(newsUrl, Encoding.UTF8);

                var body = XpathHelper.GetInnerHtmlByXPath(pageHtml, "//div[@id='artical_real']", "").Trim();

                var publishedText = StrHelper.FormatHtml(
                    XpathHelper.GetInnerHtmlByXPath(pageHtml, "//div[@id='artical_sth']/p/span[1]", "")).Trim();

                var sourceName = StrHelper.FormatHtml(
                    XpathHelper.GetInnerHtmlByXPath(pageHtml, "//div[@id='artical_sth']/p/span[3]/span", "")).Trim();

                var writer = StrHelper.FormatHtml(
                    XpathHelper.GetInnerHtmlByXPath(pageHtml, "//div[@id='artical_sth']/p/span[4]/span", "")).Trim();

                // Fall back to the fixed site name when the page does not name a source.
                if (string.IsNullOrWhiteSpace(sourceName))
                {
                    sourceName = "凤凰佛教";
                }

                body = DealContent(body);

                // Title is left empty here; nothing on the page is parsed into it.
                gathered = new DtoNews
                {
                    Contents     = body,
                    Title        = "",
                    PubTime      = StrHelper.ToDateTime(publishedText),
                    FromUrl      = newsUrl,
                    FromSiteName = sourceName,
                    Author       = writer,
                    CreateTime   = DateTime.Now,
                    IsShow       = 0,
                };
            }
            catch (Exception ex)
            {
                Log.Error(newsUrl + " 错误:" + ex.Message + ex.StackTrace);
            }

            return gathered;
        }
コード例 #5
0
        /// <summary>
        /// Downloads one page (12 items) of a Baijiahao author's article list, inserts articles
        /// that are not stored yet, refreshes the read statistics of known ones, and then calls
        /// itself for the next page while the feed reports more items.
        /// </summary>
        /// <param name="url">
        /// For page 0: a composite format string where {0} receives <paramref name="authorId"/>
        /// and {1} the skip offset. For later pages: the already formatted URL of the previous page.
        /// </param>
        /// <param name="authorId">Author id used in the URL and for the duplicate lookup.</param>
        /// <param name="groupId">Not read by this method; kept so existing callers keep compiling.</param>
        /// <param name="AuthorPageIndex">Zero-based index of the page to fetch.</param>
        /// <returns>
        /// 0 when the page was still empty after three download attempts; otherwise 1, including
        /// the cases where parsing failed or an exception was logged.
        /// </returns>
        public int DealAuthorData(string url, string authorId, string groupId, int AuthorPageIndex)
        {
            //var url = "http://baijiahao.baidu.com/api/content/article/listall?sk=super&ak=super&app_id={0}&_skip={1}&_limit=12";
            const int pageSize = 12;
            var skip = AuthorPageIndex * pageSize;

            if (skip == 0)
            {
                url = string.Format(url, authorId, skip);
            }
            else
            {
                // Only the value of the _skip parameter may change. A plain string replace of the
                // previous offset would also rewrite matching digits elsewhere in the URL
                // (e.g. every "0" of the app id on page 1, or "_limit=12" on page 2).
                var skipParam = new System.Text.RegularExpressions.Regex(@"(?<=[?&]_skip=)\d+");
                if (skipParam.IsMatch(url))
                {
                    url = skipParam.Replace(url, skip.ToString(), 1);
                }
                else
                {
                    // URL without a _skip parameter: keep the previous replacement behaviour.
                    url = url.Replace((skip - pageSize).ToString(), skip.ToString());
                }
            }

            // Declared outside the try block so the catch handler can dump the raw response.
            var strContent = "";

            try
            {
                Log.Info(url + " 百家号抓取开始 页码" + AuthorPageIndex);

                // Requests through the proxy fail often, so try up to three times.
                for (var attempt = 0; attempt < 3 && string.IsNullOrWhiteSpace(strContent); attempt++)
                {
                    strContent = HttpHelper.GetContentByAgent(url, Encoding.UTF8);
                }
                if (string.IsNullOrWhiteSpace(strContent))
                {
                    Log.Info(url + " 未抓取到任何内容 页码" + AuthorPageIndex);
                    return 0;
                }

                var data = JsonConvert.DeserializeObject <DtoBaijiahaoAuthorJsData>(strContent);
                if (data == null)
                {
                    Log.Info(url + " 百家号未取到数据 页码" + AuthorPageIndex);
                    return 1;
                }

                Log.Info(url + " 页码" + AuthorPageIndex);

                var isHaveMore = false;

                #region === Process the items of this page and store the news data ===

                if (data.items != null && data.items.Count > 0)
                {
                    // More pages exist when the reported total exceeds what has been paged so far.
                    if (data.total > (AuthorPageIndex + 1) * pageSize)
                    {
                        isHaveMore = true;
                    }
                    foreach (var subItem in data.items)
                    {
                        try
                        {
                            var pubTime = Comm.Tools.Utility.StringConverter.ToDateTime(subItem.publish_at);
                            // Articles published more than one month ago are not collected.
                            if (pubTime.AddMonths(1) < DateTime.Now)
                            {
                                Log.Info("发布时间在1月前不入库 appid=" + subItem.app_id + " pubtime=" + subItem.publish_at + " title=" + subItem.title);
                                continue;
                            }

                            var newsId = DalNews.IsExistsNews_Bjh(authorId, subItem.title);
                            if (newsId < 1)
                            {
                                #region === Not stored yet: insert ===
                                var model = new DtoNews()
                                {
                                    Author          = "",
                                    AuthorId        = authorId,
                                    Contents        = "",
                                    CreateTime      = DateTime.Now,
                                    CurReadTimes    = subItem.read_amount,
                                    FromSiteName    = "baijiahao",
                                    FromUrl         = subItem.url,
                                    IntervalMinutes = 60,
                                    IsDeal          = 0,
                                    IsHot           = 0,
                                    IsShow          = 1,
                                    LastDealTime    = DateTime.Now,
                                    LastReadTimes   = subItem.read_amount,
                                    LogoOriginalUrl = subItem.url,
                                    LogoUrl         = "",
                                    NewsHotClass    = 7,
                                    NewsTypeId      = (int)NewsTypeEnum.新闻,
                                    PubTime         = pubTime,
                                    Tags            = subItem.tag,
                                    Title           = subItem.title,
                                    TotalComments   = subItem.comment_amount,
                                    RefreshTimes    = 0,
                                    GroupId         = subItem.app_id
                                };
                                DalNews.Insert_News_Bjh(model);
                                #endregion
                            }
                            else
                            {
                                #region === Already stored: refresh the statistics ===
                                var oldNews = DalNews.GetNews_Bjh(newsId);

                                if (oldNews != null)
                                {
                                    // Refresh policy (from the original notes): articles published today are
                                    // refreshed every 15 minutes, or hourly for classes 5, 6 and 7; older
                                    // articles are refreshed daily.
                                    // Popularity class by reads gained per 15 minutes:
                                    // >10000 = 1, >5000 = 2, >2500 = 3, >1000 = 4, >500 = 5, >100 = 6, else 7.
                                    var isHot           = 0;
                                    var minutes         = (DateTime.Now - oldNews.LastDealTime).TotalMinutes;
                                    var newsClassId     = 7;
                                    var addReads        = subItem.read_amount - oldNews.CurReadTimes;
                                    var intervalMinutes = 24 * 60;
                                    if (addReads > 0)
                                    {
                                        // Hot check: hourly rate when more than an hour has passed,
                                        // otherwise the raw gain since the last refresh.
                                        if (minutes > 60)
                                        {
                                            var perHourReads = addReads / (minutes / 60.0);
                                            if (perHourReads > 10000)
                                            {
                                                isHot = 1;
                                            }
                                        }
                                        else
                                        {
                                            if (addReads > 10000)
                                            {
                                                isHot = 1;
                                            }
                                        }

                                        #region === Classification by reads per 15 minutes ===
                                        // NOTE(review): for elapsed times below 15 minutes this extrapolates the
                                        // gain upwards (unlike the hourly check above) - confirm this is intended.
                                        var per15MinutesReads = addReads / (minutes / 15.0);
                                        if (per15MinutesReads > 10000)
                                        {
                                            newsClassId     = 1;
                                            isHot           = 1;
                                            intervalMinutes = 15;
                                        }
                                        else if (per15MinutesReads > 5000)
                                        {
                                            newsClassId     = 2;
                                            isHot           = 1;
                                            intervalMinutes = 15;
                                        }
                                        else if (per15MinutesReads > 2500)
                                        {
                                            newsClassId     = 3;
                                            intervalMinutes = 15;
                                        }
                                        else if (per15MinutesReads > 1000)
                                        {
                                            newsClassId     = 4;
                                            intervalMinutes = 15;
                                        }
                                        else if (per15MinutesReads > 500)
                                        {
                                            newsClassId     = 5;
                                            intervalMinutes = 60;
                                        }
                                        else if (per15MinutesReads > 100)
                                        {
                                            newsClassId     = 6;
                                            intervalMinutes = 60;
                                        }
                                        else
                                        {
                                            newsClassId     = 7;
                                            intervalMinutes = 60;
                                        }
                                        #endregion
                                    }
                                    if (oldNews.PubTime.AddHours(24) < DateTime.Now)
                                    {
                                        // Published more than 24 hours ago: refresh once a day.
                                        intervalMinutes = 24 * 60;
                                    }

                                    // An article that was hot before stays hot.
                                    if (oldNews.IsHot == 1)
                                    {
                                        isHot = 1;
                                    }
                                    // Keep the best (numerically lowest) class the article has reached.
                                    if (oldNews.NewsHotClass < newsClassId)
                                    {
                                        newsClassId = oldNews.NewsHotClass;
                                    }
                                    var model = new DtoNews()
                                    {
                                        Id              = newsId,
                                        LastReadTimes   = oldNews.CurReadTimes,
                                        CurReadTimes    = subItem.read_amount,
                                        IsHot           = isHot,
                                        IsDeal          = 1,
                                        TotalComments   = subItem.comment_amount,
                                        IntervalMinutes = intervalMinutes,
                                        NewsHotClass    = newsClassId,
                                        LastDealTime    = DateTime.Now,
                                    };

                                    DalNews.UpdateNews_Bjh(model);

                                    // The author table's refresh interval is intentionally not updated here.
                                }
                                #endregion
                            }
                        }
                        catch (Exception ex)
                        {
                            // One bad item must not stop the page, but the failure has to be visible.
                            Log.Error(ex.Message + ex.StackTrace);
                        }
                    }
                }
                #endregion

                if (isHaveMore)
                {
                    // Short pause between pages, then continue with the next page of this author.
                    Thread.Sleep(200);

                    DealAuthorData(url, authorId, groupId, AuthorPageIndex + 1);
                }
                else
                {
                    Log.Info("本百家号抓取结束总页数" + AuthorPageIndex);
                    Thread.Sleep(200);
                }
            }
            catch (Exception ex)
            {
                Log.Error(ex.Message + ex.StackTrace);
                Log.Debug("======strContent begin 百家号抓取=========");
                Log.Debug(url);
                Log.Debug(strContent);
                Log.Debug("======strContent end =========");
            }
            return 1;
        }
コード例 #6
0
        /// <summary>
        /// Downloads a news list page (gb2312, mobile user agent) and, for every listed title
        /// that <c>DalNews.IsExistsNews</c> does not report as stored, fetches the detail page
        /// and stores the article together with its images.
        /// </summary>
        /// <param name="newsListUrl">Address of the list page.</param>
        /// <param name="newsType">
        /// 100, 200 or 300: the detail page is gathered as an article; 400: the detail page is
        /// gathered as a picture set. Any other value stores nothing.
        /// </param>
        /// <returns>Always null; results are written through <c>DalNews</c> and to the log.</returns>
        public List <DtoNewsUrlList> NewsUrlGathering(string newsListUrl, int newsType)
        {
            try
            {
                Log.Info(newsListUrl + " 抓取开始");
                var strContent = HttpHelper.GetContentByMobileAgent(newsListUrl, Encoding.GetEncoding("gb2312"));
                if (string.IsNullOrWhiteSpace(strContent))
                {
                    Log.Info(newsListUrl + " 未抓取到任何内容");
                    return null;
                }

                // One list entry per <li> of the title list.
                var strList = XpathHelper.GetInnerHtmlListByXPath(strContent, "//div[@class='leftList']/ul/li");

                // Shared generator for the throttling delays below.
                var rnd = new Random();

                if (strList != null && strList.Count > 0)
                {
                    foreach (var item in strList)
                    {
                        try
                        {
                            var url   = XpathHelper.GetAttrValueByXPath(item, "//a", "href");
                            var title = XpathHelper.GetInnerHtmlByXPath(item, "//a", "");
                            title = StrHelper.FormatHtml(title).Trim();

                            // Skip titles that are already stored.
                            if (DalNews.IsExistsNews(title))
                            {
                                continue;
                            }

                            if (newsType == 100 || newsType == 200 || newsType == 300)
                            {
                                #region === Article: fetch the content from the detail page ===

                                var news = NewsGathering(url);
                                if (news != null)
                                {
                                    news.NewsTypeId = newsType;
                                    news.Title      = title;

                                    news.PubTime = StrHelper.ToDateTime(StrHelper.FormatPubTime(news.PubTime.ToString()));

                                    var newsId = DalNews.Insert(news);
                                    if (newsId < 1)
                                    {
                                        continue;
                                    }

                                    // Extract the images from the content and store them as media rows.
                                    var mediaList = ImgDeal.GetImgList(news.Contents);
                                    if (mediaList != null && mediaList.Count > 0)
                                    {
                                        // NOTE(review): assigned after Insert, so this does not appear to
                                        // reach the stored row - confirm whether it was meant to run earlier.
                                        news.Contents = mediaList[0].Description;

                                        foreach (var picitem in mediaList)
                                        {
                                            picitem.NewsId = newsId;
                                            DalNews.InsertMedia(picitem);
                                        }
                                    }

                                    // Sleep 30-89 seconds to limit the crawl rate.
                                    Thread.Sleep(rnd.Next(30, 90) * 1000);
                                }

                                #endregion
                            }
                            if (newsType == 400)
                            {
                                #region === Picture set: fetch the images from the detail page ===

                                var mediaList = NewsPicGathering(url);

                                var news = new DtoNews()
                                {
                                    Title      = title,
                                    FromUrl    = url,
                                    NewsTypeId = newsType,
                                };
                                if (mediaList != null && mediaList.Count > 0)
                                {
                                    news.Contents = mediaList[0].Description;

                                    var newsId = DalNews.Insert(news);

                                    // Insert reports failure with a non-positive id; do not attach
                                    // media rows to an article that was never stored.
                                    if (newsId > 0)
                                    {
                                        foreach (var picitem in mediaList)
                                        {
                                            picitem.NewsId = newsId;
                                            DalNews.InsertMedia(picitem);
                                        }
                                    }
                                }

                                // Sleep 30-89 seconds to limit the crawl rate.
                                Thread.Sleep(rnd.Next(30, 90) * 1000);

                                #endregion
                            }
                            Log.Info(url + " 抓取完成");
                        }
                        catch (Exception ex)
                        {
                            // A failing entry is logged and the loop continues with the next one.
                            Log.Error(ex.Message + ex.StackTrace);
                        }
                    }
                }
                Log.Info(newsListUrl + " 抓取结束");
                return null;
            }
            catch (Exception ex)
            {
                Log.Error(ex.Message + ex.StackTrace);
            }
            return null;
        }
コード例 #7
0
        /// <summary>
        /// Downloads an article detail page (gb2312) and extracts its body, publish time,
        /// author and first image via XPath. The source name is fixed to "腾讯佛学".
        /// </summary>
        /// <param name="newsUrl">Address of the article detail page.</param>
        /// <returns>
        /// A <c>DtoNews</c> with an empty Title and IsShow = 1, whose LogoUrl and LogoOriginalUrl
        /// both hold the first image address found in the body (empty when there is none);
        /// null when any step throws (the error is logged together with the URL).
        /// </returns>
        public DtoNews NewsGathering(string newsUrl)
        {
            DtoNews gathered = null;

            try
            {
                var pageHtml = HttpHelper.GetContent(newsUrl, Encoding.GetEncoding("gb2312"));

                var body = XpathHelper.GetInnerHtmlByXPath(pageHtml, "//div[@id='Cnt-Main-Article-QQ']", "");
                body = body.Trim();

                var publishedText = StrHelper.FormatHtml(
                    XpathHelper.GetInnerHtmlByXPath(pageHtml, "//span[@class='article-time']", "")).Trim();

                // The first image inside the article body doubles as the logo.
                var imageUrls  = XpathHelper.GetAttrValueListByXPath(body, "//img", "src");
                var firstImage = (imageUrls != null && imageUrls.Count > 0) ? imageUrls[0] : "";

                var writer = StrHelper.FormatHtml(
                    XpathHelper.GetInnerHtmlByXPath(pageHtml, "//div[@id='C-Main-Article-QQ']/div[1]/div/div[1]/span[5]", "")).Trim();

                body = DealContent(body);

                // Title is left empty here; nothing on the page is parsed into it.
                gathered = new DtoNews
                {
                    Contents        = body,
                    Title           = "",
                    PubTime         = StrHelper.ToDateTime(publishedText),
                    FromUrl         = newsUrl,
                    FromSiteName    = "腾讯佛学",
                    Author          = writer,
                    CreateTime      = DateTime.Now,
                    IsShow          = 1,
                    LogoOriginalUrl = firstImage,
                    LogoUrl         = firstImage
                };
            }
            catch (Exception ex)
            {
                Log.Error(newsUrl + " 错误:" + ex.Message + ex.StackTrace);
            }

            return gathered;
        }
コード例 #8
0
ファイル: DalNews.cs プロジェクト: thachgiasoft/Lfb.DataGrab
        /// <summary>
        /// Stores one news article as a <c>T_News</c> record.
        /// </summary>
        /// <param name="model">
        /// News entity. Its null text fields (Author, Contents, FromSiteName, FromUrl,
        /// LogoOriginalUrl, LogoUrl, Title) are replaced with empty strings on the passed instance.
        /// </param>
        /// <returns>
        /// The id returned by <c>Sql.InsertId</c>, or -1 when an exception was caught and logged.
        /// </returns>
        public static int Insert(DtoNews model)
        {
            try
            {
                // Normalise missing text values in place.
                model.Author       = model.Author ?? "";
                model.Contents     = model.Contents ?? "";
                model.FromSiteName = model.FromSiteName ?? "";
                if (model.PubTime == null)
                {
                    model.PubTime = DateTime.Now;
                }
                model.FromUrl         = model.FromUrl ?? "";
                model.LogoOriginalUrl = model.LogoOriginalUrl ?? "";
                model.LogoUrl         = model.LogoUrl ?? "";
                model.Title           = model.Title ?? "";

                // IsShow is always stored as 0, and LastDealTime / CreateTime are stamped with
                // the current time regardless of what the model carries.
                var record = new T_News()
                {
                    Author          = model.Author,
                    Contents        = model.Contents,
                    FromSiteName    = model.FromSiteName,
                    FromUrl         = model.FromUrl,
                    IsShow          = 0,
                    LogoOriginalUrl = model.LogoOriginalUrl,
                    LogoUrl         = model.LogoUrl,
                    NewsTypeId      = (int)model.NewsTypeId,
                    PubTime         = model.PubTime,
                    Title           = model.Title,
                    AuthorId        = model.AuthorId,
                    TotalComments   = model.TotalComments,
                    Tags            = model.Tags,
                    NewsHotClass    = model.NewsHotClass,
                    LastReadTimes   = model.LastReadTimes,
                    LastDealTime    = DateTime.Now,
                    IsHot           = model.IsHot,
                    IsDeal          = model.IsDeal,
                    IntervalMinutes = model.IntervalMinutes,
                    CurReadTimes    = model.CurReadTimes,
                    CreateTime      = DateTime.Now,
                    GroupId         = model.GroupId,
                };

                return Sql.InsertId <T_News>(record);
            }
            catch (Exception ex)
            {
                Log.Error(ex.Message + ex.StackTrace);
                return -1;
            }
        }