Exemplo n.º 1
0
        /// <summary>
        /// Collects the articles of one channel: downloads the channel's article list, skips
        /// entries whose source URL is already stored, downloads each remaining article's detail
        /// JSON (up to five attempts) and saves the resulting news item together with its segments.
        /// </summary>
        /// <param name="chlCfg">
        /// Channel to collect; its <c>ChannelVal</c> is substituted into the site's index URL and
        /// its <c>ChannelName</c> selects the category.
        /// </param>
        public override void GetNewsDetail(ChannelConfig chlCfg)
        {
            var listUrl = string.Format(site.IndexUrl, chlCfg.ChannelVal);

            // The category is looked up by the channel name with the "网易" prefix stripped,
            // except for the "网易原创" channel, which is looked up under its full name.
            NewsCategory category = chlCfg.ChannelName == "网易原创"
                ? cateList.Find(p => p.CategoryName == chlCfg.ChannelName)
                : cateList.Find(p => p.CategoryName == chlCfg.ChannelName.Replace("网易", ""));
            if (category == null)
            {
                return;
            }

            try
            {
                // Unwrap {"<channel value>":[ ... ]} so that only the bare array is deserialized.
                var listJson = HttpUtility.Get(listUrl)
                    .Replace("{\"" + chlCfg.ChannelVal + "\":[", "[")
                    .Replace("]}", "]");
                var listSerializer = SerilizeService<List<ArticleList>>.CreateSerilizer(Serilize_Type.Json);
                var entries = listSerializer.Deserilize(listJson);

                foreach (var entry in entries)
                {
                    if (string.IsNullOrEmpty(entry.url))
                    {
                        continue;
                    }

                    // An item with the same source URL has already been collected.
                    if (newsItemAccess.Find(p => p.SourceUrl == entry.url) != null)
                    {
                        continue;
                    }

                    var newsItem = new NewsItem()
                    {
                        NewsId = -1,
                        SourceUrl = entry.url,
                        SourceSite = site.SiteId,
                        Author = "",
                        CategoryId = category.CategoryId,
                        Title = entry.title,
                        CreateTime = entry.ptime,
                        FromSite = entry.source,
                        ImgUrl = entry.imgsrc ?? "",
                        ChannelName = chlCfg.ChannelName
                    };

                    // Fetch the article content, e.g. http://c.3g.163.com/nc/article/B0DQ29J400031H2L/full.html
                    var doc = new HtmlDocument();
                    HtmlNode.ElementsFlags.Remove("option");
                    for (int attempt = 0; attempt < 5; attempt++)
                    {
                        try
                        {
                            var detailUrl = string.Format(site.DetailUrl, entry.docid);

                            // Unwrap {"<docid>":{ ... }} to the inner object.
                            var detailJson = HttpUtility.Get(detailUrl)
                                .Replace("{\"" + entry.docid + "\":{", "{")
                                .Replace("}}", "}");

                            var detailSerializer = SerilizeService<ArticleDetail>.CreateSerilizer(Serilize_Type.Json);
                            var article = detailSerializer.Deserilize(detailJson);
                            if (article.img != null)
                            {
                                // Each image is prepended to the body as its own paragraph.
                                foreach (var img in article.img)
                                {
                                    var imgHtml = string.Format("<p><img src=\"{0}\" itemprop=\"image\" alt=\"\" >{1}</ p >", img.src, img.alt);
                                    article.body = imgHtml + article.body;
                                }
                            }

                            doc.LoadHtml(article.body);
                            break;
                        }
                        catch (Exception ex)
                        {
                            Logger.WriteException(string.Format("请求详情页失败,次数:{0} , url:{1}", attempt, newsItem.SourceUrl), ex);
                        }
                    }

                    try
                    {
                        var root = doc.DocumentNode;

                        // Nothing was loaded (all attempts failed) or the article has no text.
                        if (string.IsNullOrEmpty(root.InnerText))
                        {
                            continue;
                        }

                        // Remove every <!--...--> marker found in the original text from the plain text.
                        var plainText = root.InnerText;
                        foreach (Match marker in Regex.Matches(plainText, @"<!--\S*-->"))
                        {
                            plainText = plainText.Replace(marker.Value, "");
                        }

                        newsItem.NewsText = plainText;
                        RemoveUnsafe(root);
                        newsItem.NewsContent = root.InnerHtml;

                        // Save the news item.
                        newsItemAccess.Add(newsItem);
                        newsItemAccess.Save();
                        SaveSegMents(newsItem);
                    }
                    catch (Exception ex)
                    {
                        Logger.WriteException("保存新闻异常", ex);
                    }
                }
            }
            catch (Exception ex)
            {
                Logger.WriteException(string.Format("分类下抓取新闻出现异常:{0}", chlCfg.ChannelName), ex);
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Synchronises the site's channel list: downloads the topic list from the site's channel
        /// URL and stores a channel configuration for every topic that is not stored yet.
        /// Stored channel names are the topic names prefixed with "网易".
        /// </summary>
        public override void UpdateSiteChl()
        {
            // Channel list, e.g. "http://c.3g.163.com/nc/topicset/ios/subscribe/manage/listspecial.html"
            var url = site.ChannelUrl;
            var str = HttpUtility.Get(url);

            // Unwrap {"tList":[ ... ]} so that only the bare array is deserialized.
            str = str.Replace("{\"tList\":[", "[").Replace("]}", "]");
            var topicList = SerilizeService<List<Topic>>.CreateSerilizer(Serilize_Type.Json).Deserilize(str);

            foreach (var item in topicList)
            {
                var chlCfg = new ChannelConfig();
                chlCfg.ChannelId = -1;
                chlCfg.ChannelName = "网易" + item.tname;
                chlCfg.ChannelVal = item.tid;
                chlCfg.SiteId = site.SiteId;

                // Look for an existing row using the same site id that would be inserted.
                // The previous hard-coded "SiteId == 3" only matched when the current site
                // happened to have id 3; for any other id every run inserted duplicates.
                var _chl = chlCfgAccess.Find(
                    p => p.SiteId == chlCfg.SiteId
                    && p.ChannelVal == chlCfg.ChannelVal
                    && p.ChannelName == chlCfg.ChannelName
                    );
                if (_chl != null)
                {
                    continue;
                }

                chlCfgAccess.Add(chlCfg);
                chlCfgAccess.Save();
            }
        }
 /// <summary>
 /// Collects the news of one channel from the JSON list endpoint (limited to 500 entries),
 /// skips entries whose mobile URL is already stored, downloads the content of each remaining
 /// entry (up to five attempts) and saves it under the "财经" category.
 /// </summary>
 /// <param name="chlCfg">
 /// Channel to collect; its <c>ChannelVal</c> is substituted into the site's index URL and its
 /// <c>ChannelName</c> is copied onto every saved item.
 /// </param>
 public override void GetNewsDetail(ChannelConfig chlCfg)
 {
     var url = string.Format(site.IndexUrl + "&limit=500", chlCfg.ChannelVal);
     var cate = cateList.Find(p => p.CategoryName == "财经");
     if (cate == null)
     {
         // Without the category the first new item would fail on cate.CategoryId and abort
         // the whole run, so stop before issuing any request.
         return;
     }
     try
     {
         HttpResponse<JSON> response = new CommonService.HttpResponse<JSON>();
         var json = response.GetFuncGetResponse(url, Serilize_Type.Json);

         // Address of the list endpoint without its query string (yw?encode=ywjh&limit=500).
         var uri = url.Split('?')[0];
         foreach (var item in json.news)
         {
             // An item with the same source URL has already been collected.
             var newsItem = newsItemAccess.Find(
                 p => p.SourceUrl == item.url_m
                 );
             if (newsItem != null)
             {
                 continue;
             }

             newsItem = new NewsItem()
             {
                 NewsId = -1,
                 SourceUrl = item.url_m,
                 SourceSite = site.SiteId,
                 FromSite = "东方财富网",
                 Author = ""
             };
             newsItem.CategoryId = cate.CategoryId;
             newsItem.Title = item.title;
             newsItem.CreateTime = item.showtime;
             newsItem.ImgUrl = item.image ?? "";
             newsItem.ChannelName = chlCfg.ChannelName;

             // Fetch the content.
             HtmlDocument doc = new HtmlDocument();
             HtmlNode.ElementsFlags.Remove("option");

             // The content endpoint is derived from the list endpoint by replacing "yw".
             // NOTE(review): Replace affects every "yw" in the address, not only the last
             // path segment - confirm the configured index URL contains it only once.
             var detailUrl = uri.Replace("yw", "content?newsid=" + item.newsid.ToString());
             for (int i = 0; i < 5; i++)
             {
                 try
                 {
                     HttpResponse<ContentJson> contentResponse = new CommonService.HttpResponse<ContentJson>();
                     var contentJson = contentResponse.GetFuncGetResponse(detailUrl, Serilize_Type.Json);
                     doc.LoadHtml(contentJson.body);
                     break;
                 }
                 catch (Exception ex)
                 {
                     Logger.WriteException(string.Format("请求详情页失败,次数:{0} , url:{1}", i, newsItem.SourceUrl), ex);
                 }
             }
             try
             {
                 var div = doc.DocumentNode;

                 // Nothing was loaded (all attempts failed) or the article has no text.
                 if (string.IsNullOrEmpty(div.InnerText))
                 {
                     continue;
                 }
                 newsItem.NewsText = div.InnerText.Replace("<!-- EM_StockImg_Start --><!--IMG#0--><!-- EM_StockImg_End -->", "");
                 RemoveUnsafe(div);
                 newsItem.NewsContent = div.InnerHtml;

                 // Save the news item.
                 newsItemAccess.Add(newsItem);
                 newsItemAccess.Save();
                 SaveSegMents(newsItem);
             }
             catch (Exception ex)
             {
                 Logger.WriteException("保存内容异常", ex);
             }
         }
     }
     catch (Exception ex)
     {
         Logger.WriteException(string.Format("分类下抓取新闻出现异常:{0}", chlCfg.ChannelName), ex);
     }
 }
        /// <summary>
        /// Collects the news of one channel from the paged XML list endpoint. Pages are followed
        /// through <c>nextPage</c> until it is empty; entries whose URL is already stored are
        /// skipped, the article page of every other entry is downloaded (up to five attempts) and
        /// the content of its <c>div#content</c> element is saved under the "财经" category.
        /// </summary>
        /// <param name="chlCfg">Channel whose <c>ChannelVal</c> is substituted into the site's index URL.</param>
        public override void GetNewsDetail(ChannelConfig chlCfg)
        {
            var url = string.Format(site.IndexUrl, chlCfg.ChannelVal);
            var cate = cateList.Find(p => p.CategoryName == "财经");
            if (cate == null)
            {
                // Without the category the first new item would fail on cate.CategoryId and abort
                // the whole run, so stop before issuing any request.
                return;
            }
            try
            {
                while (!string.IsNullOrEmpty(url))
                {
                    HttpResponse<xmlColumn> response = new CommonService.HttpResponse<xmlColumn>();
                    var tonghuaSunXmlColumn = response.GetFuncGetResponse(url, Serilize_Type.Xml);

                    // Used as the creation time of entries whose own timestamp cannot be parsed.
                    DateTime fetchedAt = System.DateTime.Now;
                    url = tonghuaSunXmlColumn.nextPage ?? "";
                    foreach (var item in tonghuaSunXmlColumn.pageItems.item)
                    {
                        // An item with the same source URL has already been collected.
                        var newsItem = newsItemAccess.Find(p => p.SourceUrl == item.url);
                        if (newsItem != null)
                        {
                            continue;
                        }

                        newsItem = new NewsItem() { NewsId = -1, SourceUrl = item.url, SourceSite = site.SiteId, Author = "" };
                        newsItem.CategoryId = cate.CategoryId;
                        newsItem.Title = item.title;

                        // TryParse overwrites its out argument with DateTime.MinValue on failure,
                        // so the fallback has to be applied explicitly.
                        DateTime createTime;
                        if (!DateTime.TryParse(item.ctime, out createTime))
                        {
                            createTime = fetchedAt;
                        }
                        newsItem.CreateTime = createTime;
                        newsItem.FromSite = item.source;
                        newsItem.ImgUrl = item.imgurl ?? "";
                        newsItem.ChannelName = tonghuaSunXmlColumn.columnName;

                        // Fetch the content.
                        HtmlDocument doc = new HtmlDocument();
                        HtmlNode.ElementsFlags.Remove("option");
                        for (int i = 0; i < 5; i++)
                        {
                            try
                            {
                                doc.LoadHtml(HttpUtility.Get(newsItem.SourceUrl, Encoding.UTF8));
                                break;
                            }
                            catch (Exception ex)
                            {
                                Logger.WriteException(string.Format("请求详情页失败,次数:{0} , url:{1}", i, newsItem.SourceUrl), ex);
                            }
                        }
                        try
                        {
                            // SelectSingleNode yields null when the page has no div#content
                            // (including when every download attempt failed); treat that like
                            // an empty article instead of failing on div.InnerText.
                            var div = doc.DocumentNode.SelectSingleNode("//div[@id='content']");
                            if (div == null || string.IsNullOrEmpty(div.InnerText))
                            {
                                continue;
                            }
                            newsItem.NewsText = div.InnerText;
                            RemoveUnsafe(div);
                            newsItem.NewsContent = div.InnerHtml;

                            // Save the news item.
                            newsItemAccess.Add(newsItem);
                            newsItemAccess.Save();
                            SaveSegMents(newsItem);
                        }
                        catch (Exception ex)
                        {
                            Logger.WriteException("保存新闻异常", ex);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                Logger.WriteException(string.Format("分类下抓取新闻出现异常:{0}", chlCfg.ChannelName), ex);
            }
        }
 /// <summary>
 /// Synchronises the site's channel list: downloads the XML channel index from the site's
 /// channel URL and adds a channel configuration for every index entry that has no stored
 /// row with the same site id, channel value and channel name.
 /// </summary>
 public override void UpdateSiteChl()
 {
     var indexResponse = new HttpResponse<xmlIndex>();
     var index = indexResponse.GetFuncGetResponse(site.ChannelUrl, Serilize_Type.Xml);

     foreach (var entry in index.item)
     {
         var candidate = new ChannelConfig
         {
             ChannelId = -1,
             ChannelName = entry.name,
             ChannelVal = entry.columnId,
             SiteId = site.SiteId
         };

         var stored = chlCfgAccess.Find(p => p.SiteId == candidate.SiteId
           && p.ChannelVal == candidate.ChannelVal
           && p.ChannelName == candidate.ChannelName
           );

         if (stored == null)
         {
             chlCfgAccess.Add(candidate);
         }
         else
         {
             // Only the local candidate object is modified here.
             // NOTE(review): nothing visible in this method hands these values to the
             // data access object - confirm whether an update call is missing.
             candidate.ChannelId = stored.ChannelId;
             candidate.ChannelName = entry.name;
             candidate.ChannelVal = entry.columnId;
             candidate.SiteId = site.SiteId;
         }

         // Save runs once per index entry, whether or not a row was added.
         chlCfgAccess.Save();
     }
 }
 /// <summary>
 /// Extension point for collecting the news of one channel. This base implementation does
 /// nothing; the body is intentionally empty so that derived classes can override it.
 /// </summary>
 /// <param name="chlCfg">Channel configuration to collect news for; unused here.</param>
 public virtual void GetNewsDetail(ChannelConfig chlCfg)
 {
 }