コード例 #1
0
ファイル: BjCrawler.cs プロジェクト: li-keli/LiGather
        /// <summary>
        /// Merge strategy #1: copies the search/task bookkeeping fields from
        /// <paramref name="appendEntity"/> onto the scraped <paramref name="fillModel"/>,
        /// stamps the crawl-update time, and splits the whitespace-separated
        /// registered-capital text into amount, unit and currency.
        /// </summary>
        /// <param name="fillModel">Entity filled from the scraped page; it is mutated in place.</param>
        /// <param name="appendEntity">Entity supplying the search name, task guid, operator name and creation time.</param>
        /// <returns>The same <paramref name="fillModel"/> instance that was passed in.</returns>
        private CrawlerEntity StrategyNo1(CrawlerEntity fillModel, CrawlerEntity appendEntity)
        {
            fillModel.搜索名称 = appendEntity.搜索名称;
            fillModel.TaskGuid = appendEntity.TaskGuid;
            fillModel.操作人姓名 = appendEntity.操作人姓名;
            // Keep the scraped name when there is one; otherwise fall back to the supplementary entity's name.
            if (string.IsNullOrWhiteSpace(fillModel.称))
            {
                fillModel.称 = appendEntity.称;
            }
            fillModel.入爬行库时间 = appendEntity.入爬行库时间;
            fillModel.爬行更新时间 = DateTime.Now;

            if (string.IsNullOrWhiteSpace(fillModel.注册资本))
            {
                return fillModel;
            }

            // Split on any whitespace and drop empty tokens, so leading or repeated
            // spaces cannot shift the amount / unit / currency positions.
            // The guard above guarantees at least one token.
            var parts = fillModel.注册资本.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);

            fillModel.注册资本 = parts[0];
            fillModel.实收资本 = FirstWhitespaceToken(fillModel.实收资本);
            fillModel.实缴出资金额 = FirstWhitespaceToken(fillModel.实缴出资金额);

            if (parts.Length > 1)
            {
                fillModel.资金单位 = parts[1];
            }
            if (parts.Length > 2)
            {
                fillModel.币种 = parts[2];
            }
            return fillModel;
        }

        /// <summary>
        /// Returns the first whitespace-delimited token of <paramref name="text"/>:
        /// null stays null, and text without any token yields an empty string.
        /// </summary>
        private static string FirstWhitespaceToken(string text)
        {
            if (text == null)
            {
                return null;
            }
            var tokens = text.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
            return tokens.Length > 0 ? tokens[0] : string.Empty;
        }
コード例 #2
0
ファイル: CrawlerDomain.cs プロジェクト: li-keli/LiGather
 /// <summary>
 /// Inserts <paramref name="model"/> unless a row with the same search name
 /// already exists, in which case nothing is written.
 /// </summary>
 /// <param name="model">Entity to persist.</param>
 public void Add(CrawlerEntity model)
 {
     using (var context = new LiGatherContext())
     {
         // Existence is decided on the search name only.
         bool alreadyStored = context.CrawlerEntities.Any(row => row.搜索名称.Equals(model.搜索名称));
         if (alreadyStored)
         {
             return;
         }

         context.CrawlerEntities.Add(model);
         context.SaveChanges();
     }
 }
コード例 #3
0
ファイル: CrawlerController.cs プロジェクト: li-keli/LiGather
        /// <summary>
        /// Single-company search page. With no search text it renders the empty form;
        /// otherwise it looks the company up in the target-company history, starts a
        /// crawl task when no history entry is found, and renders the stored crawl result.
        /// </summary>
        /// <param name="guid">Task identifier round-tripped through the view; a new one is generated when both arguments are blank.</param>
        /// <param name="searchInfo">Company name to search for.</param>
        /// <returns>The view, with the matching <c>CrawlerEntity</c> as model or null when none is available.</returns>
        public ActionResult SingelSearch(string guid = null, string searchInfo = null)
        {
            CrawlerEntity crawlerEntity = null;

            if (string.IsNullOrWhiteSpace(searchInfo))
            {
                // Nothing to search for: never register a task or start crawlers for a blank name.
                // Keep the caller's guid when one was supplied, otherwise hand the view a fresh one.
                if (string.IsNullOrWhiteSpace(guid))
                {
                    ViewBag.Guid = Guid.NewGuid();
                }
                else
                {
                    ViewBag.Guid = guid;
                }
                return(View(crawlerEntity));
            }

            ViewBag.Guid = guid;

            // A null result from Get(...) yields a null count, which is treated as "not in history".
            var count = new TargeCompanyDomain().Get(t => t.CompanyName.Equals(searchInfo))?.Count;
            var alreadyInHistory = count > 0;
            if (!alreadyInHistory)
            {
                // Not in the history: search online.
                var now = DateTime.Now;
                List <string> companyList = new List <string> {
                    searchInfo
                };
                TaskEntity model = new TaskEntity();
                model.TaskType       = EnumTaskType.BjCrawler;
                model.TaskName       = $"单个任务[{now.ToString("G")}]";
                model.Unique         = Conv.ToGuid(guid);
                model.TaskStateDicId = 1;
                model.TaskNum        = 1;
                model.CreateTime     = now;
                model.IsSingelSearch = true;
                new TaskDomain().Add(model);
                new BaseData(model).InsertMetadata(companyList, model.TaskName, model, taskEntity =>
                {
                    // Four identical crawls are started and only the first to finish is awaited;
                    // the others keep running in the background.
                    // NOTE(review): presumably a race across different random proxies — confirm.
                    const int crawlerCount = 4;
                    Task[] tasks = new Task[crawlerCount];
                    for (int i = 0; i < crawlerCount; i++)
                    {
                        tasks[i] = new Task(() =>
                        {
                            var bjqyxy = new Crawler.Bjqyxy.BjCrawler(taskEntity, t => t.TaskGuid.Equals(taskEntity.Unique));
                            bjqyxy.SingelSearch(searchInfo);
                        });
                        tasks[i].Start();
                    }
                    Task.WaitAny(tasks);
                });
            }

            // Only rows that actually carry a company name count as a result.
            crawlerEntity = new CrawlerDomain().Get(t => t.搜索名称 == searchInfo && t.称 != null).FirstOrDefault();
            return(View(crawlerEntity));
        }
コード例 #4
0
ファイル: BjCrawler.cs プロジェクト: li-keli/LiGather
        /// <summary>
        /// Builds a <c>CrawlerEntity</c> from a flat list of scraped cell texts.
        /// For each property, the list is searched for an entry equal to the property
        /// name, and the entry immediately after it is assigned as the property value.
        /// </summary>
        /// <param name="scoreList">Scraped cell texts, expected as alternating label / value entries.</param>
        /// <returns>A new entity with every matched, string-assignable property set; unmatched properties keep their defaults.</returns>
        private CrawlerEntity FillModel(List <string> scoreList)
        {
            var model = new CrawlerEntity();

            foreach (var property in model.GetType().GetProperties())
            {
                // Values are assigned as raw strings, so skip anything that cannot take one
                // (read-only, indexed or non-string properties) rather than letting SetValue throw.
                if (!property.CanWrite ||
                    property.GetIndexParameters().Length > 0 ||
                    !property.PropertyType.IsAssignableFrom(typeof(string)))
                {
                    continue;
                }

                var propertyName = property.Name;
                var labelIndex   = scoreList.FindIndex(c => string.Equals(c, propertyName, StringComparison.Ordinal));
                var valueIndex   = labelIndex + 1;

                // No label found, or the label is the last cell and therefore has no value after it.
                if (labelIndex < 0 || valueIndex >= scoreList.Count)
                {
                    continue;
                }
                property.SetValue(model, scoreList[valueIndex], null);
            }
            return(model);
        }
コード例 #5
0
ファイル: BjCrawler.cs プロジェクト: li-keli/LiGather
 /// <summary>
 /// Stores the bare search entity when an error occurred or the search returned
 /// no value, then reports the empty record on the console.
 /// </summary>
 /// <param name="targetModel">Entity to persist as the empty record.</param>
 private void AddNull(CrawlerEntity targetModel)
 {
     var crawlerStore = new CrawlerDomain();
     crawlerStore.Add(targetModel);

     var workerId   = Task.CurrentId;
     var searchName = targetModel.搜索名称;
     Console.WriteLine("{0} 空对象:{1}", workerId, searchName);
 }
コード例 #6
0
ファイル: BjCrawler.cs プロジェクト: li-keli/LiGather
        /// <summary>
        /// Crawls a single company: posts the search through a proxy, follows the last
        /// result link to the detail page, turns the detail tables into a
        /// <c>CrawlerEntity</c> and stores it. Rejected responses and exceptions cause
        /// another attempt; an empty result list stores an empty record and stops.
        /// </summary>
        /// <param name="companyName">Company name to search for.</param>
        public void SingelSearch(string companyName)
        {
            var httpClient = new HttpClient();

            httpClient.Setting.Timeout = 1000 * 5;
            // Initial request to the entry URL before searching.
            // NOTE(review): presumably establishes the session/cookies — confirm.
            httpClient.Create <string>(HttpMethod.Post, firsturl).Send();

            // NOTE(review): this loop has no attempt limit. A failure that repeats on
            // every attempt (for example no proxy obtainable) retries and logs forever.
            while (true)
            {
                var targetModel = new CrawlerEntity {
                    搜索名称 = companyName, 操作人姓名 = TaskEntity.OperatorName, 入爬行库时间 = TaskEntity.CreateTime, TaskGuid = TaskEntity.Unique
                };
                try
                {
                    // Proxy handling: prefer a stored proxy, fall back to fetching one online.
                    var proxyEntity = new ProxyDomain().GetByRandom();
                    if (proxyEntity == null)
                    {
                        Console.WriteLine("在线代理临时获取策略启动。");
                        proxyEntity = Proxy.Proxy.GetInstance().GetHttProxyEntity();
                        Console.WriteLine("线上获取到了代理:{0}:{1}", proxyEntity.IpAddress, proxyEntity.Port);
                    }

                    httpClient.Setting.Proxy = new WebProxy(proxyEntity.IpAddress, proxyEntity.Port);

                    var resultBody = httpClient.Create <string>(HttpMethod.Post, targetUrl, data: new
                    {
                        queryStr = targetModel.搜索名称,
                        module   = "",
                        idFlag   = "qyxy"
                    }).Send();
                    // An invalid response, or one flagged by ValidText, discards this proxy and retries.
                    if (!resultBody.IsValid() || ValidText(resultBody.Result))
                    {
                        RemoveOldIp(proxyEntity);
                        continue;
                    }

                    // Extract the second-level links from the search result.
                    var urls = new JumonyParser().Parse(resultBody.Result).Find("li a").ToList();
                    if (urls.Count < 1)
                    {
                        AddNull(targetModel);
                        break;
                    }

                    // The last link on the page supplies the company name and the detail URL.
                    var lastLink = urls[urls.Count - 1];
                    targetModel.称 = lastLink.InnerText();
                    var nextUrl = url + lastLink.Attribute("href").AttributeValue;

                    // Only the query string of the link is reused against the detail endpoint.
                    var detailQuery = new Uri(firsturl + nextUrl).Query;

                    // Fetch the detail page.
                    var resultsecondBody =
                        httpClient.Create <string>(HttpMethod.Get, zhuUrl + detailQuery).Send();
                    var nameValueCollection =
                        new NameValueCollection(URL.GetQueryString(detailQuery));
                    if (!resultsecondBody.IsValid() || ValidText(resultsecondBody.Result))
                    {
                        RemoveOldIp(proxyEntity);
                        continue;
                    }

                    // Detail processing: header cells are rewritten to data cells so labels
                    // and values end up in one flat list.
                    var sorceIhtml = new JumonyParser().Parse(resultsecondBody.Result.Replace("<th", "<td"));
                    var tableLists = sorceIhtml.Find("table[class='f-lbiao']").ToList();
                    var listall    = new List <string>();
                    foreach (var tableList in tableLists)
                    {
                        foreach (var cell in tableList.Find("tr td"))
                        {
                            // Trim whitespace before stripping the trailing colon so "label: " also
                            // loses its colon. Both the ASCII and the full-width colon are removed;
                            // a label that keeps its colon cannot match a property name in FillModel.
                            listall.Add(cell.InnerText().Trim().TrimEnd(':', ':').Trim());
                        }
                    }

                    var fillModel = FillModel(listall);
                    // A link without reg_bus_ent_id throws here and is handled by the catch below.
                    fillModel.全局唯一编号 = nameValueCollection["reg_bus_ent_id"].ToUpperInvariant();
                    new CrawlerDomain().Add(StrategyNo1(fillModel, targetModel));

                    // Follow-up bookkeeping: record one more successful use of this proxy.
                    proxyEntity.Usage = proxyEntity.Usage + 1;
                    new ProxyDomain().Update(proxyEntity);
                    Console.WriteLine("{0} 抓取到:{1}", Task.CurrentId, targetModel.搜索名称);
                }
                catch (Exception e)
                {
                    // Log the failure and try again.
                    new LogDomain().Add(new LogEntity {
                        LogType = "error", TaskName = TaskEntity.TaskName, ErrorDetails = Task.CurrentId + "线程: " + e.Message, Details = e.ToString(), TriggerTime = DateTime.Now
                    });
                    continue;
                }
                break;
            }
        }