/// <summary>
/// Fetches one article detail page through <c>HttpHelper.GetContentByMobileAgent</c>
/// (decoded as UTF-8) and extracts its body, publish time, source name and author.
/// </summary>
/// <param name="newsUrl">Address of the article detail page.</param>
/// <returns>
/// A populated <see cref="DtoNews"/>, or <c>null</c> when nothing was downloaded or
/// an exception was thrown while processing the page (the exception is logged).
/// The title is not extracted here and is left empty.
/// </returns>
public DtoNews NewsGathering(string newsUrl)
        {
            try
            {
                var pageHtml = HttpHelper.GetContentByMobileAgent(newsUrl, Encoding.UTF8);

                // Without this guard an empty download would fall through the XPath
                // lookups (which default to "") and produce an article with no content.
                if (string.IsNullOrWhiteSpace(pageHtml))
                {
                    Log.Info(newsUrl + " 未抓取到任何内容");
                    return null;
                }

                var body = XpathHelper.GetInnerHtmlByXPath(pageHtml, "//div[@id='artical_real']", "").Trim();

                var publishedText = XpathHelper.GetInnerHtmlByXPath(pageHtml, "//div[@id='artical_sth']/p/span[1]", "");
                publishedText = StrHelper.FormatHtml(publishedText).Trim();

                var siteName = XpathHelper.GetInnerHtmlByXPath(pageHtml, "//div[@id='artical_sth']/p/span[3]/span", "");
                siteName = StrHelper.FormatHtml(siteName).Trim();

                var author = XpathHelper.GetInnerHtmlByXPath(pageHtml, "//div[@id='artical_sth']/p/span[4]/span", "");
                author = StrHelper.FormatHtml(author).Trim();

                // Fall back to a fixed source name when the page does not state one.
                if (string.IsNullOrWhiteSpace(siteName))
                {
                    siteName = "凤凰佛教";
                }

                body = DealContent(body);

                return new DtoNews
                {
                    Contents     = body,
                    Title        = "",
                    PubTime      = StrHelper.ToDateTime(publishedText),
                    FromUrl      = newsUrl,
                    FromSiteName = siteName,
                    Author       = author,
                    CreateTime   = DateTime.Now,
                    IsShow       = 0,
                };
            }
            catch (Exception ex)
            {
                // Best effort: a failing page is logged and reported to the caller as null.
                Log.Error(newsUrl + " 错误:" + ex.Message + ex.StackTrace);
            }
            return null;
        }
Beispiel #2
0
        /// <summary>
        /// Crawls a news list page, and for every list entry whose title is not already
        /// stored, fetches the detail page and persists the result through <c>DalNews</c>.
        /// </summary>
        /// <param name="newsListUrl">Address of the list page to crawl.</param>
        /// <param name="newsType">
        /// News category id. 100, 200 and 300 are processed as text articles
        /// (via <c>NewsGathering</c>); 400 is processed as a picture set
        /// (via <c>NewsPicGathering</c>). For any other value only the duplicate
        /// check and the completion log are executed.
        /// </param>
        /// <returns>
        /// Always <c>null</c>; results are written to storage instead of being returned.
        /// </returns>
        public List <DtoNewsUrlList> NewsUrlGathering(string newsListUrl, int newsType)
        {
            try
            {
                Log.Info(newsListUrl + " 抓取开始");
                var strContent = HttpHelper.GetContentByMobileAgent(newsListUrl, Encoding.GetEncoding("gb2312"));
                if (string.IsNullOrWhiteSpace(strContent))
                {
                    Log.Info(newsListUrl + " 未抓取到任何内容");
                    return null;
                }

                // Fetch the list of headline entries.
                var strList = XpathHelper.GetInnerHtmlListByXPath(strContent, "//div[@class='leftList']/ul/li");

                if (strList != null)
                {
                    foreach (var item in strList)
                    {
                        // Declared outside the try so the catch block can report which entry failed.
                        var url = "";
                        try
                        {
                            url = XpathHelper.GetAttrValueByXPath(item, "//a", "href");
                            if (string.IsNullOrWhiteSpace(url))
                            {
                                // An entry without a link cannot be crawled; skip it rather
                                // than requesting an empty address.
                                continue;
                            }

                            var title = XpathHelper.GetInnerHtmlByXPath(item, "//a", "");
                            title = StrHelper.FormatHtml(title).Trim();

                            // Skip entries that are already stored.
                            if (DalNews.IsExistsNews(title))
                            {
                                continue;
                            }

                            if (newsType == 100 || newsType == 200 || newsType == 300)
                            {
                                #region === Fetch the article content from the detail page ===

                                var news = NewsGathering(url);
                                if (news != null)
                                {
                                    news.NewsTypeId = newsType;
                                    news.Title      = title;

                                    news.PubTime = StrHelper.ToDateTime(StrHelper.FormatPubTime(news.PubTime.ToString()));

                                    // Persist the article.
                                    var newsId = DalNews.Insert(news);
                                    if (newsId < 1)
                                    {
                                        continue;
                                    }

                                    // Extract the images from the content and store them as media rows.
                                    var mediaList = ImgDeal.GetImgList(news.Contents);
                                    if (mediaList != null && mediaList.Count > 0)
                                    {
                                        // NOTE(review): this assignment happens after the Insert call
                                        // above, so it is not part of what was passed to Insert —
                                        // confirm whether it is still needed.
                                        news.Contents = mediaList[0].Description;

                                        foreach (var picitem in mediaList)
                                        {
                                            picitem.NewsId = newsId;
                                            DalNews.InsertMedia(picitem);
                                        }
                                    }

                                    PauseBetweenRequests();
                                }

                                #endregion
                            }
                            if (newsType == 400)
                            {
                                #region === Fetch the picture set from the detail page ===

                                var mediaList = NewsPicGathering(url);

                                var news = new DtoNews()
                                {
                                    Title      = title,
                                    FromUrl    = url,
                                    NewsTypeId = newsType,
                                };
                                if (mediaList != null && mediaList.Count > 0)
                                {
                                    news.Contents = mediaList[0].Description;

                                    // Persist the news row first; media rows reference its id.
                                    var newsId = DalNews.Insert(news);

                                    // Only attach media when the insert produced a valid id,
                                    // matching the check done for text articles above.
                                    if (newsId >= 1)
                                    {
                                        foreach (var picitem in mediaList)
                                        {
                                            picitem.NewsId = newsId;
                                            DalNews.InsertMedia(picitem);
                                        }
                                    }
                                }

                                PauseBetweenRequests();

                                #endregion
                            }
                            Log.Info(url + " 抓取完成");
                        }
                        catch (Exception ex)
                        {
                            // One failing entry must not abort the rest of the list.
                            Log.Error(url + " 错误:" + ex.Message + ex.StackTrace);
                        }
                    }
                }
                Log.Info(newsListUrl + " 抓取结束");
                return null;
            }
            catch (Exception ex)
            {
                Log.Error(newsListUrl + " 错误:" + ex.Message + ex.StackTrace);
            }
            return null;
        }

        /// <summary>
        /// Blocks the current thread for a random whole number of seconds in the
        /// range 30 to 89 to limit the crawl frequency.
        /// </summary>
        private static void PauseBetweenRequests()
        {
            var rnd          = new Random();
            var sleepSeconds = rnd.Next(30, 90); // upper bound is exclusive
            Thread.Sleep(sleepSeconds * 1000);
        }
Beispiel #3
0
        /// <summary>
        /// Fetches one article detail page through <c>HttpHelper.GetContent</c>
        /// (decoded as gb2312) and extracts its body, publish time, author and the
        /// first image address found in the body.
        /// </summary>
        /// <param name="newsUrl">Address of the article detail page.</param>
        /// <returns>
        /// A populated <see cref="DtoNews"/>, or <c>null</c> when nothing was downloaded or
        /// an exception was thrown while processing the page (the exception is logged).
        /// The title is not extracted here and is left empty; the source name is fixed.
        /// </returns>
        public DtoNews NewsGathering(string newsUrl)
        {
            try
            {
                var pageHtml = HttpHelper.GetContent(newsUrl, Encoding.GetEncoding("gb2312"));

                // Without this guard an empty download would fall through the XPath
                // lookups (which default to "") and produce an article with no content.
                if (string.IsNullOrWhiteSpace(pageHtml))
                {
                    Log.Info(newsUrl + " 未抓取到任何内容");
                    return null;
                }

                // Article body. Earlier attempts to strip the share bar at the top and the
                // advertisement block at the bottom were disabled and are not performed here.
                var body = XpathHelper.GetInnerHtmlByXPath(pageHtml, "//div[@id='Cnt-Main-Article-QQ']", "");
                body = body.Trim();

                var publishedText = XpathHelper.GetInnerHtmlByXPath(pageHtml, "//span[@class='article-time']", "");
                publishedText = StrHelper.FormatHtml(publishedText).Trim();

                // The source name is hard-coded rather than read from the page.
                var siteName = "腾讯佛学";

                // Use the first image in the body (if any) as the logo.
                var logoUrl    = "";
                var picUrlList = XpathHelper.GetAttrValueListByXPath(body, "//img", "src");
                if (picUrlList != null && picUrlList.Count > 0)
                {
                    logoUrl = picUrlList[0];
                }

                var author = XpathHelper.GetInnerHtmlByXPath(pageHtml, "//div[@id='C-Main-Article-QQ']/div[1]/div/div[1]/span[5]", "");
                author = StrHelper.FormatHtml(author).Trim();

                body = DealContent(body);

                return new DtoNews
                {
                    Contents        = body,
                    Title           = "",
                    PubTime         = StrHelper.ToDateTime(publishedText),
                    FromUrl         = newsUrl,
                    FromSiteName    = siteName,
                    Author          = author,
                    CreateTime      = DateTime.Now,
                    IsShow          = 1,
                    LogoOriginalUrl = logoUrl,
                    LogoUrl         = logoUrl
                };
            }
            catch (Exception ex)
            {
                // Best effort: a failing page is logged and reported to the caller as null.
                Log.Error(newsUrl + " 错误:" + ex.Message + ex.StackTrace);
            }
            return null;
        }