Ejemplo n.º 1
0
 /// <summary>
 /// Inserts a new crawler entity, unless a stored entity already has the same
 /// search name (搜索名称), in which case nothing is written.
 /// </summary>
 /// <param name="model">The entity to insert; its 搜索名称 is used for the existence check.</param>
 public void Add(CrawlerEntity model)
 {
     using (var context = new LiGatherContext())
     {
         // Skip the insert entirely when a row with this search name is already stored.
         var alreadyStored = context.CrawlerEntities.Any(existing => existing.搜索名称.Equals(model.搜索名称));
         if (alreadyStored)
         {
             return;
         }

         context.CrawlerEntities.Add(model);
         context.SaveChanges();
     }
 }
Ejemplo n.º 2
0
        /// <summary>
        /// Generation strategy no. 1: completes a scraped entity with the search/task metadata
        /// and normalises its capital-related fields.
        /// </summary>
        /// <remarks>
        /// <paramref name="fillModel"/> is modified in place. When 注册资本 is not blank it is split on
        /// spaces into amount (注册资本), unit (资金单位) and currency (币种), and 实收资本 / 实缴出资金额
        /// are reduced to their first space-separated token. Empty tokens caused by leading or
        /// repeated spaces are ignored so that they cannot shift the amount, unit or currency
        /// into the wrong field.
        /// </remarks>
        /// <param name="fillModel">Data container holding the scraped values; updated and returned.</param>
        /// <param name="appendEntity">Supplementary container supplying the search name, task GUID,
        /// operator name, fallback name and crawl-library entry time.</param>
        /// <returns>The same <paramref name="fillModel"/> instance.</returns>
        private CrawlerEntity StrategyNo1(CrawlerEntity fillModel, CrawlerEntity appendEntity)
        {
            fillModel.搜索名称 = appendEntity.搜索名称;
            fillModel.TaskGuid = appendEntity.TaskGuid;
            fillModel.操作人姓名 = appendEntity.操作人姓名;
            // Keep the scraped name when there is one; otherwise fall back to the supplementary name.
            fillModel.名称 = string.IsNullOrWhiteSpace(fillModel.名称) ? appendEntity.名称 : fillModel.名称;
            fillModel.入爬行库时间 = appendEntity.入爬行库时间;
            fillModel.爬行更新时间 = DateTime.Now;

            if (string.IsNullOrWhiteSpace(fillModel.注册资本))
            {
                return fillModel;
            }

            // RemoveEmptyEntries: "100  万元" must yield ["100", "万元"], not ["100", "", "万元"].
            var parts = fillModel.注册资本.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            if (parts.Length == 0)
            {
                // Defensive only: a non-blank value always contains at least one token.
                return fillModel;
            }

            fillModel.注册资本 = parts[0].Trim();
            fillModel.实收资本 = FirstSpaceSeparatedToken(fillModel.实收资本);
            fillModel.实缴出资金额 = FirstSpaceSeparatedToken(fillModel.实缴出资金额);

            if (parts.Length > 1)
            {
                fillModel.资金单位 = parts[1].Trim();
            }
            if (parts.Length > 2)
            {
                fillModel.币种 = parts[2].Trim();
            }
            return fillModel;
        }

        /// <summary>
        /// Returns the first non-empty space-separated token of <paramref name="value"/>;
        /// null stays null, and a value without any token becomes the empty string.
        /// </summary>
        private static string FirstSpaceSeparatedToken(string value)
        {
            if (value == null)
            {
                return null;
            }

            var tokens = value.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            return tokens.Length > 0 ? tokens[0] : string.Empty;
        }
Ejemplo n.º 3
0
 /// <summary>
 /// Builds a populated <c>CrawlerEntity</c> from a flat list of scraped cell texts.
 /// </summary>
 /// <remarks>
 /// For every public property of the entity, the property name is searched in
 /// <paramref name="scoreList"/>; when found, the element immediately after it is assigned
 /// to the property as a string. A name that is the last element of the list has no value
 /// after it and is skipped, as are properties without a setter.
 /// </remarks>
 /// <param name="scoreList">Flat list in which a label is expected to be followed by its value.</param>
 /// <returns>A new entity with every matched property set.</returns>
 private CrawlerEntity FillModel(List<string> scoreList)
 {
     var model = new CrawlerEntity();
     foreach (var property in model.GetType().GetProperties())
     {
         if (!property.CanWrite)
         {
             continue;
         }

         // IndexOf performs the same ordinal match as Equals but tolerates null list elements.
         var labelIndex = scoreList.IndexOf(property.Name);
         var valueIndex = labelIndex + 1;
         if (labelIndex < 0 || valueIndex >= scoreList.Count)
         {
             // Label not present, or present as the very last element with no value after it.
             continue;
         }

         // Values are assigned as raw strings; no type conversion is attempted.
         property.SetValue(model, scoreList[valueIndex], null);
     }
     return model;
 }
Ejemplo n.º 4
0
 /// <summary>
 /// Performs an "empty" insert: stores the given entity as-is through <c>CrawlerDomain.Add</c>
 /// (used when an error occurred or no value was retrieved) and writes a console line with
 /// the current task id and the entity's search name.
 /// </summary>
 /// <param name="targetModel">The entity to store.</param>
 private void AddNull(CrawlerEntity targetModel)
 {
     var crawlerDomain = new CrawlerDomain();
     crawlerDomain.Add(targetModel);

     var taskId = Task.CurrentId;
     var searchName = targetModel.搜索名称;
     Console.WriteLine("{0} 空对象:{1}", taskId, searchName);
 }
Ejemplo n.º 5
0
        /// <summary>
        /// Single query: searches for one company by name through a proxy, scrapes the detail
        /// page of the result and stores it. The whole attempt is repeated in a loop until it
        /// either completes or the search returns no result links.
        /// </summary>
        /// <remarks>
        /// NOTE(review): there is no retry limit. Invalid responses and any caught exception
        /// restart the loop, so a failure that repeats on every attempt loops indefinitely.
        /// </remarks>
        /// <param name="companyName">Company name to search for; stored as 搜索名称 and posted as <c>queryStr</c>.</param>
        public void SingelSearch(string companyName)
        {
            var httpClient = new HttpClient();
            // NOTE(review): presumably milliseconds (i.e. 5 seconds) — confirm against the HttpClient wrapper.
            httpClient.Setting.Timeout = 1000 * 5;
            // Initial POST to firsturl; the response is discarded.
            // NOTE(review): presumably this establishes a session/cookies — confirm.
            httpClient.Create<string>(HttpMethod.Post, firsturl).Send();
            while (true)
            {
                // A fresh target model is built for every attempt, carrying the task metadata.
                var targetModel = new CrawlerEntity { 搜索名称 = companyName, 操作人姓名 = TaskEntity.OperatorName, 入爬行库时间 = TaskEntity.CreateTime, TaskGuid = TaskEntity.Unique };
                try
                {
                    //IP handling
                    var proxyEntity = new ProxyDomain().GetByRandom(); //proxy IP
                    if (proxyEntity == null)
                    {
                        // No stored proxy available: fall back to fetching one online.
                        Console.WriteLine("在线代理临时获取策略启动。");
                        proxyEntity = Proxy.Proxy.GetInstance().GetHttProxyEntity();
                        Console.WriteLine("线上获取到了代理:{0}:{1}", proxyEntity.IpAddress, proxyEntity.Port);
                    }

                    httpClient.Setting.Proxy = new WebProxy(proxyEntity.IpAddress, proxyEntity.Port);

                    // Search request: POST the company name to targetUrl.
                    var resultBody = httpClient.Create<string>(HttpMethod.Post, targetUrl, data: new
                    {
                        queryStr = targetModel.搜索名称,
                        module = "",
                        idFlag = "qyxy"
                    }).Send();
                    // Invalid response: discard this proxy and retry.
                    if (!resultBody.IsValid())
                    {
                        RemoveOldIp(proxyEntity);
                        continue;
                    }
                    // ValidText returning true also discards the proxy and retries.
                    // NOTE(review): presumably it detects a blocked/verification page — confirm.
                    if (ValidText(resultBody.Result))
                    {
                        RemoveOldIp(proxyEntity);
                        continue;
                    }
                    //Extract the second-level links
                    var parser = new JumonyParser();
                    var urls = parser.Parse(resultBody.Result).Find("li a").ToList();
                    var nextUrl = "";
                    // No result links: store the bare search entity and stop.
                    if (urls.Count < 1)
                    {
                        AddNull(targetModel);
                        break;
                    }
                    // Each iteration overwrites the previous values, so after the loop
                    // 名称 and nextUrl hold the data of the last link only.
                    foreach (var htmlElement in urls)
                    {
                        targetModel.名称 = htmlElement.InnerText();
                        nextUrl = url + htmlElement.Attribute("href").AttributeValue;
                    }
                    //Fetch the target detail body: GET zhuUrl plus the query string of (firsturl + nextUrl)
                    var resultsecondBody =
                        httpClient.Create<string>(HttpMethod.Get, zhuUrl + new Uri(firsturl + nextUrl).Query).Send();
                    // The same query string is parsed into a collection; reg_bus_ent_id is read from it below.
                    var nameValueCollection =
                        new NameValueCollection(URL.GetQueryString(new Uri(firsturl + nextUrl).Query));
                    if (!resultsecondBody.IsValid())
                    {
                        RemoveOldIp(proxyEntity);
                        continue;
                    }
                    if (ValidText(resultsecondBody.Result))
                    {
                        RemoveOldIp(proxyEntity);
                        continue;
                    }
                    //Body processing: "<th" is rewritten to "<td" so header cells are matched by the "tr td" selector too
                    var sorceIhtml = new JumonyParser().Parse(resultsecondBody.Result.Replace("<th", "<td"));
                    var tableLists = sorceIhtml.Find("table[class='f-lbiao']").ToList();
                    // Flat list of all cell texts, with trailing ':' and surrounding whitespace removed.
                    // NOTE(review): TrimEnd(':') is applied twice with the same character; the second call
                    // may originally have targeted a different colon character — confirm.
                    var listall = new List<string>();
                    foreach (var tableList in tableLists)
                        tableList.Find("tr td")
                            .ForEach(t => listall.Add(t.InnerText().TrimEnd(':').TrimEnd(':').Trim()));
                    var fillModel = FillModel(listall);
                    // NOTE(review): throws NullReferenceException when reg_bus_ent_id is absent from the
                    // query string; the catch below then logs it and retries.
                    fillModel.全局唯一编号 = nameValueCollection["reg_bus_ent_id"].ToUpper();
                    new CrawlerDomain().Add(StrategyNo1(fillModel, targetModel));
                    //Follow-up handling: increment the proxy's usage counter and persist it
                    proxyEntity.Usage = proxyEntity.Usage + 1;
                    new ProxyDomain().Update(proxyEntity);
                    Console.WriteLine("{0} 抓取到:{1}", Task.CurrentId, targetModel.搜索名称);
                }
                catch (Exception e)
                {
                    // Any exception is logged and the whole attempt is retried.
                    new LogDomain().Add(new LogEntity { LogType = "error", TaskName = TaskEntity.TaskName, ErrorDetails = Task.CurrentId + "线程: " + e.Message, Details = e.ToString(), TriggerTime = DateTime.Now });
                    continue;
                }
                // Reached only when the attempt completed without an exception or early retry.
                break;
            }
        }