示例#1
0
        /// <summary>
        /// Weather-service main loop: periodically reloads the crawl interval and the
        /// enabled crawler configuration from the database, runs the weather and
        /// living-index crawlers, records the service state, then waits for the next cycle.
        /// </summary>
        /// <remarks>
        /// NOTE(review): kept as async void for signature compatibility; exceptions
        /// escaping this method cannot be observed by callers.
        /// </remarks>
        public async static void Start()
        {
            while (true)
            {
                // Default crawl interval: 24 hours, in milliseconds.
                var waittime = 24 * 60 * 60 * 1000;
                try
                {
                    v_crawler crawler = null;

                    using (var db = new BizDataContext())
                    {
                        // Read the crawl interval from configuration. TryParse keeps the
                        // default when the value is malformed; Int32.Parse here used to
                        // abort the whole cycle on bad configuration data.
                        var temp = db.Set<T_Configuration>().Where(p => p.Configuration_Key == CommonHelper.WEATHERKEY).FirstOrDefault();
                        int configuredInterval;
                        if (temp != null && Int32.TryParse(temp.Configuration_Value, out configuredInterval))
                        {
                            waittime = configuredInterval;
                        }

                        // Load the enabled weather-crawler service configuration.
                        crawler = await db.Set<T_HTZ_CrawlerService>().Where(p => p.State == v_common.YesState && p.ServiceType == (int)HTZ_CrawlerService_ServiceTypeEnum.Weather && p.IsEnable.Value).Select(p => new v_crawler
                        {
                            id = p.HTZ_CrawlerService_Id,
                            infotype = p.InfoType ?? 0,
                            name = p.HTZ_CrawlerService_Name,
                            xmlfile = p.XMLFilePath,
                            crawlertype = (int)HTZ_CrawlerService_ServiceTypeEnum.Weather
                        }).FirstOrDefaultAsync();
                    }

                    // No enabled crawler configured: skip this cycle. Previously an
                    // empty v_crawler flowed into WeatherCrawler and failed with a
                    // null xmlfile path. The finally block still runs, so we still wait.
                    if (crawler == null)
                    {
                        continue;
                    }

                    // Crawl the weather forecast.
                    WeatherCrawler(crawler);

                    // Crawl the living-index data.
                    LivingIndexCrawler(crawler);

                    // Record that the service completed this cycle successfully.
                    await CommonHelper.SaveNewState((int)HTZ_ServiceState_ServiceStateEnum.Fine, crawler.id);
                }
                catch (CrawlerException ex)
                {
                    await CommonHelper.SaveException(ex);
                }
                catch (Exception ex)
                {
                    // Wrap unexpected failures so they are persisted like crawler errors.
                    var e = new CrawlerException()
                    {
                        crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
                        exceptionbrief = "天气抓取服务错误",
                        exceptionmessage = ex.Message,
                        statuscode = 501,
                        serviceid = 2
                    };
                    await CommonHelper.SaveException(e);
                }
                finally
                {
                    // Task.Delay instead of Thread.Sleep: do not block a thread-pool
                    // thread for up to 24 hours inside an async method.
                    await Task.Delay(waittime);
                }
            }
        }
示例#2
0
        /// <summary>
        /// 抓取天气信息
        /// </summary>
        public static void WeatherCrawler(v_crawler crawler)
        {
            //获取xml配置文件
            var cfg = new XmlDocument();
            cfg.Load(crawler.xmlfile);

            var rootNode = cfg.SelectSingleNode("data");
            if (rootNode == null)
            {
                var e = new CrawlerException()
                {
                    crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
                    exceptionbrief = "配置文件出错",
                    exceptionmessage = "未找到主配置项data",
                    statuscode = 500,
                    serviceid = crawler.id
                };
                throw e;
            }

            var tideCrawler = new SimpleCrawler();//新建一个爬虫

            //抓取错误的处理
            tideCrawler.OnError += (s, e) =>
            {
                var ex = new CrawlerException()
                {
                    crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
                    exceptionbrief = "抓取出错",
                    exceptionmessage = e.Exception.Message,
                    statuscode = 500,
                    serviceid = crawler.id
                };
                throw ex;
            };

            //抓取成功后的解析
            tideCrawler.OnCompleted += async (s, e) =>
            {

                try
                {
                    using (var db = new BizDataContext())
                    {
                        await SaveWeekData(e.PageSource, rootNode, db);
                    }
                }
                catch (Exception ex)
                {
                    var ee = new CrawlerException()
                    {
                        crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
                        exceptionbrief = "解析出错",
                        exceptionmessage = ex.Message,
                        statuscode = 500,
                        serviceid = crawler.id
                    };
                    throw ee;
                }
            };


            //获取抓取url
            var url = rootNode.Attributes["url"].Value;

            //获取编码格式
            var encode = "utf-8";

            if (rootNode.Attributes["encode"] != null)
            {
                encode = rootNode.Attributes["encode"].Value;
            }

            //启动抓取
            if (!string.IsNullOrEmpty(url))
            {
                tideCrawler.Start(new Uri(url), encode).Wait();
            }
        }
示例#3
0
        /// <summary>
        /// 抓取生活指数信息
        /// </summary>
        /// <summary>
        /// Crawls living-index information according to the crawler's XML configuration file.
        /// </summary>
        /// <param name="crawler">Crawler service configuration (XML file path, service id).</param>
        /// <exception cref="CrawlerException">
        /// Thrown when the configuration file is invalid or when fetching/parsing fails.
        /// </exception>
        public static void LivingIndexCrawler(v_crawler crawler)
        {
            // Load the XML configuration file for this crawler.
            var cfg = new XmlDocument();
            cfg.Load(crawler.xmlfile);

            var rootNode = cfg.SelectSingleNode("data");
            if (rootNode == null)
            {
                var e = new CrawlerException()
                {
                    crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
                    exceptionbrief = "配置文件出错",
                    exceptionmessage = "未找到主配置项data",
                    statuscode = 500,
                    serviceid = crawler.id
                };
                throw e;
            }

            var livingIndexCrawler = new SimpleCrawler(); // page fetcher

            // Fetch failures are surfaced as CrawlerException.
            livingIndexCrawler.OnError += (s, e) =>
            {
                var ee = new CrawlerException()
                {
                    crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
                    exceptionbrief = "生活指数抓取出错",
                    exceptionmessage = e.Exception.Message,
                    statuscode = 500,
                    serviceid = crawler.id
                };
                throw ee;
            };

            // Successful fetch: parse the page and persist the living-index data.
            livingIndexCrawler.OnCompleted += async (s, e) =>
            {
                try
                {
                    using (var db = new BizDataContext())
                    {
                        await SaveLivingIndexData(e.PageSource, rootNode, db);
                    }
                }
                catch (Exception ex)
                {
                    var ee = new CrawlerException()
                    {
                        crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
                        exceptionbrief = "详情解析出错",
                        exceptionmessage = ex.Message,
                        statuscode = 500,
                        serviceid = crawler.id
                    };
                    throw ee;
                }
            };

            // The living-index url lives on a dedicated child node. Previously a
            // missing node or attribute caused a NullReferenceException instead of
            // a meaningful configuration error.
            var livingIndexNode = rootNode.SelectSingleNode("LivingIndexConfig");
            var urlAttribute = livingIndexNode == null ? null : livingIndexNode.Attributes["url"];
            if (urlAttribute == null)
            {
                var e = new CrawlerException()
                {
                    crawlertype = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
                    exceptionbrief = "配置文件出错",
                    exceptionmessage = "未找到LivingIndexConfig的url配置",
                    statuscode = 500,
                    serviceid = crawler.id
                };
                throw e;
            }
            var url = urlAttribute.Value;

            // Page encoding defaults to utf-8 unless configured otherwise.
            var encode = "utf-8";

            if (rootNode.Attributes["encode"] != null)
            {
                encode = rootNode.Attributes["encode"].Value;
            }

            // Start the crawl (synchronous: this method's callers are not async).
            if (!string.IsNullOrEmpty(url))
            {
                livingIndexCrawler.Start(new Uri(url), encode).Wait();
            }
        }
示例#4
0
        /// <summary>
        /// 新闻抓取方法
        /// </summary>
        /// <param name="infotype">抓取的新闻栏目类型</param>
        /// <param name="infoDoc"></param>
        /// <summary>
        /// News crawl entry point: loads the XML configuration, fetches the list page,
        /// and persists the parsed news items. Runs on the thread pool; every failure
        /// is recorded via CommonHelper.SaveException instead of propagating.
        /// </summary>
        /// <param name="crawler">Crawler service configuration (XML path, url, encoding, info type).</param>
        /// <remarks>
        /// NOTE(review): kept as async void for signature compatibility; callers
        /// cannot await or observe completion of this method.
        /// </remarks>
        public async static void InfomationCrawler(v_crawler crawler)
        {
            await Task.Run(async() =>
            {
                try
                {
                    var xmlDocument = new XmlDocument();
                    xmlDocument.Load(crawler.xmlfile);

                    var rootNode = xmlDocument.SelectSingleNode("data");
                    if (rootNode == null)
                    {
                        throw new CrawlerException(crawler.id, "配置文件错误", "未找到主配置项data");
                    }

                    // Page encoding defaults to DefaultEncode unless configured otherwise.
                    crawler.encode = DefaultEncode;

                    if (rootNode.Attributes["encode"] != null)
                    {
                        crawler.encode = rootNode.Attributes["encode"].Value;
                    }

                    // The crawl url is mandatory. Previously a missing attribute raised a
                    // NullReferenceException that was recorded with the hard-coded
                    // serviceid 2 instead of a proper configuration error for this service.
                    var urlAttribute = rootNode.Attributes["url"];
                    if (urlAttribute == null)
                    {
                        throw new CrawlerException(crawler.id, "配置文件错误", "未找到url配置");
                    }
                    crawler.url = urlAttribute.Value;

                    var infoCrawler = new SimpleCrawler(); // page fetcher

                    // Fetch failures are surfaced as CrawlerException and handled below.
                    infoCrawler.OnError += (s, e) =>
                    {
                        throw new CrawlerException(crawler.id, "获取页面代码时错误", e.Exception.Message);
                    };
                    // Successful fetch: parse/persist the list, then record a healthy state.
                    infoCrawler.OnCompleted += async(s, e) =>
                    {
                        try
                        {
                            using (var db = new BizDataContext())
                            {
                                SaveInfomation(e.PageSource, crawler, xmlDocument, db);

                                await CommonHelper.SaveNewState((int)HTZ_ServiceState_ServiceStateEnum.Fine, crawler.id);
                            }
                        }
                        catch (Exception ex)
                        {
                            var messageException = new CrawlerException(crawler.id, "解析出错", ex.Message);
                            await SaveException(messageException);
                        }
                    };

                    if (!string.IsNullOrEmpty(crawler.url))
                    {
                        infoCrawler.Start(new Uri(crawler.url), crawler.encode).Wait();
                    }
                }
                catch (CrawlerException ex)
                {
                    await CommonHelper.SaveException(ex);
                }
                catch (Exception ex)
                {
                    // Wrap unexpected failures so they are persisted like crawler errors.
                    var e = new CrawlerException()
                    {
                        crawlertype      = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
                        exceptionbrief   = "信息抓取服务错误",
                        exceptionmessage = ex.Message,
                        statuscode       = 501,
                        serviceid        = 2
                    };
                    await CommonHelper.SaveException(e);
                }
            });
        }
示例#5
0
        /// <summary>
        /// 解析数据
        /// </summary>
        /// <param name="html">原始html</param>
        /// <param name="crawler">抓取服务配置</param>
        /// <param name="infoNode">xml配置数据</param>
        /// <param name="db">数据库链接</param>
        /// <param name="encode">html编码格式</param>
        /// <param name="url">抓取原始url</param>
        /// <returns></returns>
        /// <summary>
        /// Parses a crawled news-list page and saves each new item it finds.
        /// </summary>
        /// <param name="html">Raw html of the list page.</param>
        /// <param name="crawler">Crawler service configuration (url, encoding, info type).</param>
        /// <param name="infoNode">XML configuration document.</param>
        /// <param name="db">Database connection used for duplicate checks.</param>
        public static void SaveInfomation(string html, v_crawler crawler, XmlDocument infoNode, BizDataContext db)
        {
            // Root node of the crawled page.
            var rootNode = CommonHelper.GetRootNode(html);

            // Root node of the XML configuration.
            var xmlRoot = infoNode.SelectSingleNode("data");

            // Configuration describing how to locate the news list in the page.
            var listConfig = xmlRoot.SelectSingleNode("ListConfig");

            // Walk the configured selector chain to find the news list
            // (modelListNodes). Single-select steps strip one layer of wrapper
            // markup; the first multi-select step yields the list and ends the walk.
            HtmlNode           modelNode      = null;
            HtmlNodeCollection modelListNodes = null;

            foreach (XmlNode item in listConfig.ChildNodes)
            {
                if (modelNode == null)
                {
                    modelNode = rootNode;
                }

                if (item.Attributes["issingleselect"].Value.ToBool())
                {
                    // Strip one more layer of wrapper markup.
                    modelNode = modelNode.SelectSingleNode(item.Attributes["signstr"].Value);
                }
                else
                {
                    // Found the news list; stop walking.
                    modelListNodes = modelNode.SelectNodes(item.Attributes["signstr"].Value);
                    break;
                }
            }

            // Nothing matched (page layout changed, or the configuration contains
            // only single-select steps). Previously this fell through into a
            // NullReferenceException on the foreach below.
            if (modelListNodes == null)
            {
                return;
            }

            // Configuration describing how to map a list entry onto a news entity.
            var infoConfig = xmlRoot.SelectSingleNode("InfoConfig");

            foreach (HtmlNode info in modelListNodes)
            {
                T_Information entity = new T_Information();

                var detailUrl = string.Empty;

                // Each child node of InfoConfig maps one property of the entity.
                foreach (XmlNode property in infoConfig.ChildNodes)
                {
                    if (property.Name == "property")
                    {
                        entity = CommonHelper.GetProperty(entity, info, property);
                    }
                    else if (property.Name == "DetailUrl")
                    {
                        detailUrl = GetUrl(info, property, crawler.url);
                    }
                }

                var count = db.Set<T_Information>().Where(p => p.Information_Id == entity.Information_Id).Select(p => 1).Count();

                // Skip items already in the database. This used to be a `return`,
                // which silently dropped every remaining (possibly new) item in the
                // list as soon as one known item was encountered.
                if (count >= 1)
                {
                    continue;
                }

                entity.State           = (int)T_InformationStateEnum.Publish;
                entity.InformationType = (int)t_informationtypeenum.customcontent;
                entity.InfoTypeIds     = crawler.infotype + ",";

                // Set the source url BEFORE the detail crawl: the detail crawler
                // persists the entity synchronously, so assigning afterwards (as
                // before) meant the value never reached the database.
                entity.OriginalSourceUrl = detailUrl;

                if (!string.IsNullOrEmpty(detailUrl))
                {
                    // Fetch the detail page to complete and persist the entity.
                    InfomationDetailCrawler(detailUrl, xmlRoot, entity, crawler, crawler.encode);
                }
            }
        }
示例#6
0
        /// <summary>
        /// Completes a news entity from its crawled detail page and persists the
        /// entity together with its info-type link row and content row.
        /// </summary>
        /// <param name="pageSource">Raw html of the detail page.</param>
        /// <param name="info">News entity pre-filled from the list page; completed here.</param>
        /// <param name="infoNode">XML configuration node containing the "DataDetail" mapping.</param>
        /// <param name="db">Database connection used for id generation and inserts.</param>
        /// <param name="crawler">Crawler service configuration (supplies the info type id).</param>
        /// <param name="detailUrl">Url the detail page was fetched from.</param>
        private async static Task SaveInfomationDetail(string pageSource, T_Information info, XmlNode infoNode, BizDataContext db, v_crawler crawler, string detailUrl)
        {
            var rootNode = CommonHelper.GetRootNode(pageSource);

            if (info != null)
            {
                var xmlRoot = infoNode.SelectSingleNode("DataDetail");

                // Everything up to the last '/' of the detail url; passed to
                // GetProperty, presumably to resolve relative links — TODO confirm.
                var detailHost = new Regex(@"\S+/").Match(detailUrl).Value;

                // Each child node of DataDetail maps one property of the entity.
                foreach (XmlNode property in xmlRoot.ChildNodes)
                {
                    info = CommonHelper.GetProperty(info, rootNode, property, detailHost);
                }


                // Link row tying the news entity to its configured info type.
                var info_infotag = new T_InfoType_Information()
                {
                    InfoType_Information_Id = await db.GetNextIdentity_IntAsync(),
                    CreateTime    = DateTime.Now,
                    InformationId = info.Information_Id,
                    InfoTypeId    = crawler.infotype,
                };

                // Content row carrying the parsed article body as plain text/words.
                var informationcontent = new T_Information_Content()
                {
                    Information_Content_Id = await db.GetNextIdentity_IntAsync(),
                    Conent      = info.Content,
                    ContentType = (int)Information_Content_ContentTypeEnum.TextWords,

                    OrderIndex    = 0,
                    InformationId = info.Information_Id,
                    State         = 0,
                    CreateTime    = DateTime.Now,
                };

                info.ClassifyId  = (int)InfoSource.Crawler;
                // NOTE(review): PublishTime is copied from CreateTime, which is not
                // visibly assigned in this method — confirm it is set by GetProperty
                // or upstream, otherwise this may be default(DateTime).
                info.PublishTime = info.CreateTime;
                info.IsTop       = false;



                // Rows are only persisted when the article body is non-empty;
                // entities without content are silently dropped.
                if (!string.IsNullOrEmpty(informationcontent.Conent))
                {
                    // Extract the first <img> tag's src and store it as an attachment.
                    var regex    = new Regex("<img.*?>");
                    var imgMtach = regex.Match(info.Content);
                    if (imgMtach.Success)
                    {
                        var img      = imgMtach.Value;
                        var srcMatch = new Regex("src=\".*?\"").Match(img);
                        if (srcMatch.Success)
                        {
                            // NOTE(review): src still contains the src="..." wrapper, and
                            // the attachment row carries no visible link to the news
                            // entity — verify this matches the T_Attachment schema.
                            var src = srcMatch.Value;

                            var att = new T_Attachment()
                            {
                                Attachment_ID = await db.GetNextIdentity_IntAsync(),
                                FilePath      = src,

                                State      = 0,
                                CreateTime = DateTime.Now,
                            };
                            await db.InsertAsync(att);
                        }
                    }

                    await db.InsertAsync(info);

                    await db.InsertAsync(info_infotag);

                    await db.InsertAsync(informationcontent);
                }
            }
        }
示例#7
0
        /// <summary>
        /// 抓取青网本地信息
        /// </summary>
        public static void InfomationDetailCrawler(string url, XmlNode infoNode, T_Information info, v_crawler crawler, string encode)
        {
            var infoDetailCrawler = new SimpleCrawler();//新建一个爬虫服务

            infoDetailCrawler.OnError += (s, e) =>
            {
                var ee = new CrawlerException()
                {
                    crawlertype      = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
                    exceptionbrief   = "详情抓取出错",
                    exceptionmessage = e.Exception.Message,
                    statuscode       = 500,
                    serviceid        = crawler.id
                };
                throw ee;
            };
            infoDetailCrawler.OnCompleted += async(s, e) =>
            {
                try
                {
                    using (var db = new BizDataContext())
                    {
                        await SaveInfomationDetail(e.PageSource, info, infoNode, db, crawler, url);
                    }
                }
                catch (Exception ex)
                {
                    var ee = new CrawlerException()
                    {
                        crawlertype      = (int)HTZ_ExceptionHandler_ServiceTypeEnum.DataGrab,
                        exceptionbrief   = "详情解析出错",
                        exceptionmessage = ex.Message,
                        statuscode       = 500,
                        serviceid        = crawler.id
                    };
                    await CommonHelper.SaveException(ee);
                }
            };

            infoDetailCrawler.Start(new Uri(url), encode).Wait();//没被封锁就别使用代理:60.221.50.118:8090
        }