Code Example #1
        public ActionResult CreateNewInfo(T_Information model)
        {
            // Default the creation time when the client did not supply one.
            model.CreateTime = model.CreateTime ?? DateTime.Now;
            dbContent.Entry(model).State = EntityState.Added;

            // Save the uploaded cover image, if any, and store its path.
            var file = Request.Files["file_conver"];
            if (file != null && file.ContentLength > 0)
            {
                model.Conver = SaveFile(file);
            }

            dbContent.SaveChanges();
            return Ok();
        }
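Both Example #1 and Example #2 call a SaveFile helper that is not shown on this page. Below is a minimal sketch of what it might look like, assuming classic ASP.NET MVC (HttpPostedFileBase) inside the same controller; the ~/Upload folder and the GUID-based file name are assumptions for illustration, not the original implementation:

        // Hypothetical SaveFile helper; the /Upload folder and the GUID-based
        // file name are assumed for illustration only.
        private string SaveFile(HttpPostedFileBase file)
        {
            if (file == null || file.ContentLength == 0)
            {
                return null;
            }

            // Use a GUID to avoid collisions between uploads with the same name.
            var fileName = Guid.NewGuid() + Path.GetExtension(file.FileName);
            var folder   = Server.MapPath("~/Upload");
            Directory.CreateDirectory(folder);

            file.SaveAs(Path.Combine(folder, fileName));

            // Return the web-relative path that is stored on the entity.
            return "/Upload/" + fileName;
        }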
Code Example #2
        public ActionResult ModifyNewInfo(T_Information model)
        {
            dbContent.Entry(model).State = EntityState.Modified;

            // Replace the cover image only when a new file was uploaded;
            // otherwise keep the value already stored in the database.
            var file = Request.Files["file_conver"];
            if (file != null && file.ContentLength > 0)
            {
                model.Conver = SaveFile(file);
            }
            else
            {
                dbContent.Entry(model).Property(x => x.Conver).IsModified = false;
            }

            dbContent.SaveChanges();
            return Ok();
        }
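The T_Information entity these actions bind to is likewise not shown. Here is a sketch limited to the members the examples on this page actually touch; the property types and the full schema are assumptions:

        // Hypothetical shape of T_Information, restricted to the members used
        // in these examples; the types are assumed.
        public class T_Information
        {
            public int       Information_Id    { get; set; }
            public DateTime? CreateTime        { get; set; }
            public DateTime? PublishTime       { get; set; }
            public string    Conver            { get; set; } // cover image path
            public string    Content           { get; set; }
            public int       State             { get; set; }
            public int       InformationType   { get; set; }
            public int       ClassifyId        { get; set; }
            public string    InfoTypeIds       { get; set; }
            public bool      IsTop             { get; set; }
            public string    OriginalSourceUrl { get; set; }
        }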
Code Example #3
        /// <summary>
        /// Crawls the detail page of a local Qingwang (青网) news item
        /// </summary>
        public static void InfomationDetailCrawler(string url, XmlNode infoNode, T_Information info, v_crawler crawler, string encode)
        {
            var infoDetailCrawler = new SimpleCrawler(); // create a new crawler instance

            infoDetailCrawler.OnError += (s, e) =>
            {
                var ee = new CrawlerException()
                {
                    crawlertype      = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
                    exceptionbrief   = "详情抓取出错",
                    exceptionmessage = e.Exception.Message,
                    statuscode       = 500,
                    serviceid        = crawler.id
                };
                throw ee;
            };
            infoDetailCrawler.OnCompleted += async (s, e) =>
            {
                try
                {
                    using (var db = new BizDataContext())
                    {
                        await SaveInfomationDetail(e.PageSource, info, infoNode, db, crawler, url);
                    }
                }
                catch (Exception ex)
                {
                    var ee = new CrawlerException()
                    {
                        crawlertype      = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
                        exceptionbrief   = "详情解析出错",
                        exceptionmessage = ex.Message,
                        statuscode       = 500,
                        serviceid        = crawler.id
                    };
                    await CommonHelper.SaveException(ee);
                }
            };

            infoDetailCrawler.Start(new Uri(url), encode).Wait(); // only fall back to the proxy 60.221.50.118:8090 if the crawler gets blocked
        }
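CrawlerException is built with object-initializer syntax in both handlers and thrown directly, so it must derive from Exception; a sketch, with the property types assumed:

        // Hypothetical sketch of CrawlerException inferred from the
        // initializers above; the property types are assumptions.
        public class CrawlerException : Exception
        {
            public int    crawlertype      { get; set; }
            public string exceptionbrief   { get; set; }
            public string exceptionmessage { get; set; }
            public int    statuscode       { get; set; }
            public int    serviceid        { get; set; }
        }

Note that OnCompleted is subscribed with an async lambda, which compiles to an async void handler, so Start(new Uri(url), encode).Wait() can return before SaveInfomationDetail has finished writing to the database.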
Code Example #4
        /// <summary>
        /// Parses a crawled list page and saves each news item it contains
        /// </summary>
        /// <param name="html">raw HTML of the list page</param>
        /// <param name="crawler">crawler service configuration</param>
        /// <param name="infoNode">XML configuration document</param>
        /// <param name="db">database context</param>
        public static void SaveInfomation(string html, v_crawler crawler, XmlDocument infoNode, BizDataContext db)
        {
            // Root node of the page HTML.
            var rootNode = CommonHelper.GetRootNode(html);

            // Root node of the XML configuration.
            var xmlRoot = infoNode.SelectSingleNode("data");

            // Configuration node describing how to locate the news list.
            var listConfig = xmlRoot.SelectSingleNode("ListConfig");

            // The goal of this loop is to populate modelListNodes, i.e. the news
            // list. Several layers of wrapper markup may need to be peeled away
            // first, hence the foreach: the normal flow is a number of
            // SelectSingleNode steps followed by one final SelectNodes that
            // returns the list.
            HtmlNode           modelNode      = null;
            HtmlNodeCollection modelListNodes = null;

            foreach (XmlNode item in listConfig.ChildNodes)
            {
                if (modelNode == null)
                {
                    modelNode = rootNode;
                }

                if (item.Attributes["issingleselect"].Value.ToBool())
                {
                    // Peel away another wrapper layer.
                    modelNode = modelNode.SelectSingleNode(item.Attributes["signstr"].Value);
                }
                else
                {
                    // Final step: this is the news list itself, so stop the loop.
                    modelListNodes = modelNode.SelectNodes(item.Attributes["signstr"].Value);
                    break;
                }
            }

            // Configuration node describing how to map a list entry to an entity.
            var infoConfig = xmlRoot.SelectSingleNode("InfoConfig");

            // Process each entry of the news list found above.
            foreach (HtmlNode info in modelListNodes)
            {
                T_Information entity = new T_Information();

                var detailUrl = string.Empty;

                // The item config contains several child nodes, each describing
                // one property; assign them one by one.
                foreach (XmlNode property in infoConfig.ChildNodes)
                {
                    if (property.Name == "property")
                    {
                        entity = CommonHelper.GetProperty(entity, info, property);
                    }
                    else if (property.Name == "DetailUrl")
                    {
                        detailUrl = GetUrl(info, property, crawler.url);
                    }
                }

                // Skip items that already exist in the database.
                if (db.Set<T_Information>().Any(p => p.Information_Id == entity.Information_Id))
                {
                    continue;
                }

                entity.State             = (int)T_InformationStateEnum.Publish;
                entity.InformationType   = (int)t_informationtypeenum.customcontent;
                entity.InfoTypeIds       = crawler.infotype + ",";
                // Set before the detail crawl below, which persists the entity.
                entity.OriginalSourceUrl = detailUrl;

                if (!string.IsNullOrEmpty(detailUrl))
                {
                    // With the list-level properties assigned, fetch the detail
                    // page to complete the entity with the full article body.
                    InfomationDetailCrawler(detailUrl, xmlRoot, entity, crawler, crawler.encode);
                }
            }
        }
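The XML configuration that SaveInfomation walks is not shown. Judging from the node and attribute names it reads (data, ListConfig, InfoConfig, issingleselect, signstr, property, DetailUrl), a compatible document might look like the sketch below; the item element name (the code never checks it), the XPath expressions, and the comments are invented for illustration:

        // Hypothetical crawler configuration matching the traversal above.
        var config = new XmlDocument();
        config.LoadXml(@"
        <data>
          <ListConfig>
            <item issingleselect='true'  signstr='//div[@class=""news-list""]' />
            <item issingleselect='false' signstr='.//li' />
          </ListConfig>
          <InfoConfig>
            <property />   <!-- one node per entity property, mapped by CommonHelper.GetProperty -->
            <DetailUrl />  <!-- resolved against crawler.url by GetUrl -->
          </InfoConfig>
          <DataDetail>
            <!-- per-property rules applied by SaveInfomationDetail -->
          </DataDetail>
        </data>");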
Code Example #5
        private static async Task SaveInfomationDetail(string pageSource, T_Information info, XmlNode infoNode, BizDataContext db, v_crawler crawler, string detailUrl)
        {
            var rootNode = CommonHelper.GetRootNode(pageSource);

            if (info != null)
            {
                var xmlRoot = infoNode.SelectSingleNode("DataDetail");

                // Everything up to the last '/' of the detail URL, used to
                // resolve relative resource paths.
                var detailHost = new Regex(@"\S+/").Match(detailUrl).Value;

                // Each child node of DataDetail describes one property of the
                // entity; assign them one by one.
                foreach (XmlNode property in xmlRoot.ChildNodes)
                {
                    info = CommonHelper.GetProperty(info, rootNode, property, detailHost);
                }


                // Link row that attaches the news item to its info type.
                var info_infotag = new T_InfoType_Information()
                {
                    InfoType_Information_Id = await db.GetNextIdentity_IntAsync(),
                    CreateTime    = DateTime.Now,
                    InformationId = info.Information_Id,
                    InfoTypeId    = crawler.infotype,
                };

                // Content row that stores the article body.
                var informationcontent = new T_Information_Content()
                {
                    Information_Content_Id = await db.GetNextIdentity_IntAsync(),
                    Conent      = info.Content,
                    ContentType = (int)Information_Content_ContentTypeEnum.TextWords,

                    OrderIndex    = 0,
                    InformationId = info.Information_Id,
                    State         = 0,
                    CreateTime    = DateTime.Now,
                };

                info.ClassifyId  = (int)InfoSource.Crawler;
                info.PublishTime = info.CreateTime;
                info.IsTop       = false;

                if (!string.IsNullOrEmpty(informationcontent.Conent))
                {
                    // If the article body contains an image, record the first
                    // image URL as an attachment.
                    var imgMatch = new Regex("<img.*?>").Match(info.Content);
                    if (imgMatch.Success)
                    {
                        // Capture group 1 holds just the URL inside src="...".
                        var srcMatch = new Regex("src=\"(.*?)\"").Match(imgMatch.Value);
                        if (srcMatch.Success)
                        {
                            var src = srcMatch.Groups[1].Value;

                            var att = new T_Attachment()
                            {
                                Attachment_ID = await db.GetNextIdentity_IntAsync(),
                                FilePath      = src,

                                State      = 0,
                                CreateTime = DateTime.Now,
                            };
                            await db.InsertAsync(att);
                        }
                    }

                    await db.InsertAsync(info);

                    await db.InsertAsync(info_infotag);

                    await db.InsertAsync(informationcontent);
                }
            }
        }
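Since the project already parses HTML with HtmlAgilityPack (HtmlNode, HtmlNodeCollection above), the first image URL could also be extracted without regular expressions. A sketch of that alternative, assuming info.Content holds an HTML fragment:

        // Alternative to the regex block above: find the first <img> via
        // HtmlAgilityPack and read its src attribute.
        private static string FirstImageSrc(string htmlFragment)
        {
            var doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(htmlFragment);

            // SelectSingleNode returns null when nothing matches.
            var img = doc.DocumentNode.SelectSingleNode("//img[@src]");
            return img?.GetAttributeValue("src", null);
        }

This sidesteps the edge cases of matching attributes with regular expressions, such as single-quoted or unquoted src values.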