/// <summary>
/// Inserts <paramref name="model"/> unless a stored row already has the same search name (搜索名称).
/// </summary>
/// <param name="model">Entity to persist; its 搜索名称 value is the duplicate-detection key.</param>
public void Add(CrawlerEntity model)
{
    using (var context = new LiGatherContext())
    {
        var alreadyStored = context.CrawlerEntities.Any(existing => existing.搜索名称.Equals(model.搜索名称));
        if (alreadyStored)
        {
            // A row with this search name exists: leave the store untouched.
            return;
        }

        context.CrawlerEntities.Add(model);
        context.SaveChanges();
    }
}
/// <summary>
/// Merge strategy no. 1: copies the bookkeeping fields of <paramref name="appendEntity"/> onto
/// <paramref name="fillModel"/> and breaks the registered-capital text (注册资本) into
/// amount, capital unit (资金单位) and currency (币种).
/// </summary>
/// <param name="fillModel">Entity to complete; it is modified in place.</param>
/// <param name="appendEntity">Supplies the search name, task GUID, operator name, crawl-insert time and a fallback name.</param>
/// <returns>The same <paramref name="fillModel"/> instance that was passed in.</returns>
private CrawlerEntity StrategyNo1(CrawlerEntity fillModel, CrawlerEntity appendEntity)
{
    fillModel.搜索名称 = appendEntity.搜索名称;
    fillModel.TaskGuid = appendEntity.TaskGuid;
    fillModel.操作人姓名 = appendEntity.操作人姓名;
    // Keep a name that is already present; otherwise fall back to the one carried by appendEntity.
    fillModel.名称 = string.IsNullOrWhiteSpace(fillModel.名称) ? appendEntity.名称 : fillModel.名称;
    fillModel.入爬行库时间 = appendEntity.入爬行库时间;
    fillModel.爬行更新时间 = DateTime.Now;

    if (string.IsNullOrWhiteSpace(fillModel.注册资本))
    {
        return fillModel;
    }

    // Empty entries are dropped so that a leading or doubled space cannot shift the
    // amount / unit / currency tokens into the wrong fields.
    var capitalTokens = SplitOnSpaces(fillModel.注册资本);
    if (capitalTokens.Length > 0)
    {
        fillModel.注册资本 = capitalTokens[0].Trim();
        // The paid-in fields are only reduced to their first token when a registered capital exists.
        fillModel.实收资本 = FirstSpaceToken(fillModel.实收资本);
        fillModel.实缴出资金额 = FirstSpaceToken(fillModel.实缴出资金额);
    }

    if (capitalTokens.Length > 1)
    {
        fillModel.资金单位 = capitalTokens[1].Trim();
    }

    if (capitalTokens.Length > 2)
    {
        fillModel.币种 = capitalTokens[2].Trim();
    }

    return fillModel;
}

/// <summary>Splits <paramref name="value"/> on the space character, discarding empty entries.</summary>
private static string[] SplitOnSpaces(string value)
{
    return value.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
}

/// <summary>
/// Returns the first space-delimited token of <paramref name="value"/>: null for null input,
/// an empty string when the input holds no token.
/// </summary>
private static string FirstSpaceToken(string value)
{
    if (value == null)
    {
        return null;
    }

    var tokens = SplitOnSpaces(value);
    return tokens.Length > 0 ? tokens[0] : string.Empty;
}
/// <summary>
/// Builds a <see cref="CrawlerEntity"/> from a flat list of scraped texts: for every public property
/// whose name occurs in <paramref name="scoreList"/>, the entry immediately after that occurrence
/// is assigned as the property value.
/// </summary>
/// <param name="scoreList">Flattened texts in which a label is expected to be followed by its value.</param>
/// <returns>A new entity with every matched, string-assignable property filled in.</returns>
private CrawlerEntity FillModel(List<string> scoreList)
{
    var model = new CrawlerEntity();

    foreach (var property in model.GetType().GetProperties())
    {
        // Scraped values are strings: assigning one to a property of another type (or to a
        // property without a public setter) would throw from SetValue, so those are skipped.
        if (property.GetSetMethod() == null || !property.PropertyType.IsAssignableFrom(typeof(string)))
        {
            continue;
        }

        var labelIndex = scoreList.IndexOf(property.Name);
        var valueIndex = labelIndex + 1;

        // Skip when the label is absent, or when it is the last entry and has no value after it.
        if (labelIndex < 0 || valueIndex >= scoreList.Count)
        {
            continue;
        }

        property.SetValue(model, scoreList[valueIndex], null);
    }

    return model;
}
/// <summary>
/// Performs the "empty" insert used when a search failed or returned nothing: hands
/// <paramref name="targetModel"/> to a new <c>CrawlerDomain</c> and reports it on the console.
/// </summary>
/// <param name="targetModel">Placeholder entity; its 搜索名称 value is echoed in the console line.</param>
private void AddNull(CrawlerEntity targetModel)
{
    var domain = new CrawlerDomain();
    domain.Add(targetModel);

    var message = string.Format("{0} 空对象:{1}", Task.CurrentId, targetModel.搜索名称);
    Console.WriteLine(message);
}
/// <summary>
/// Single-company query: searches for <paramref name="companyName"/> through a proxy, follows the
/// result link to the detail page, turns the detail tables into a <c>CrawlerEntity</c> and stores it.
/// The whole attempt is repeated until it either stores a result or finds no result links.
/// </summary>
/// <param name="companyName">Name to search for; stored as 搜索名称 and sent as the <c>queryStr</c> form field.</param>
public void SingelSearch(string companyName)
{
    var httpClient = new HttpClient();
    // NOTE(review): presumably milliseconds (5 s) - confirm against the HttpClient wrapper in use.
    httpClient.Setting.Timeout = 1000 * 5;
    // Initial request to firsturl; the response is discarded.
    // NOTE(review): presumably primes session/cookie state - confirm. It runs outside the try below,
    // so an exception here propagates to the caller.
    httpClient.Create<string>(HttpMethod.Post, firsturl).Send();
    // Retry loop: every `continue` starts a fresh attempt with a newly built targetModel.
    // NOTE(review): no retry limit is visible here; a failure that repeats on every attempt loops forever.
    while (true)
    {
        var targetModel = new CrawlerEntity { 搜索名称 = companyName, 操作人姓名 = TaskEntity.OperatorName, 入爬行库时间 = TaskEntity.CreateTime, TaskGuid = TaskEntity.Unique };
        try
        {
            // Proxy IP handling
            var proxyEntity = new ProxyDomain().GetByRandom(); // proxy IP
            if (proxyEntity == null)
            {
                // No stored proxy was returned: fall back to fetching one online.
                Console.WriteLine("在线代理临时获取策略启动。");
                proxyEntity = Proxy.Proxy.GetInstance().GetHttProxyEntity();
                Console.WriteLine("线上获取到了代理:{0}:{1}", proxyEntity.IpAddress, proxyEntity.Port);
            }
            httpClient.Setting.Proxy = new WebProxy(proxyEntity.IpAddress, proxyEntity.Port);
            var resultBody = httpClient.Create<string>(HttpMethod.Post, targetUrl, data: new { queryStr = targetModel.搜索名称, module = "", idFlag = "qyxy" }).Send();
            // An invalid response discards the proxy and retries.
            if (!resultBody.IsValid()) { RemoveOldIp(proxyEntity); continue; }
            // A true result from ValidText is also treated as a failed attempt.
            // NOTE(review): presumably it detects a block/verification page - confirm.
            if (ValidText(resultBody.Result)) { RemoveOldIp(proxyEntity); continue; }
            // Extract the second-level link
            var parser = new JumonyParser();
            var urls = parser.Parse(resultBody.Result).Find("li a").ToList();
            var nextUrl = "";
            // No result links: store the placeholder entity and stop.
            if (urls.Count < 1) { AddNull(targetModel); break; }
            // Each iteration overwrites the previous values, so the last link in the list wins.
            foreach (var htmlElement in urls) { targetModel.名称 = htmlElement.InnerText(); nextUrl = url + htmlElement.Attribute("href").AttributeValue; }
            // Fetch the target detail page
            var resultsecondBody = httpClient.Create<string>(HttpMethod.Get, zhuUrl + new Uri(firsturl + nextUrl).Query).Send();
            var nameValueCollection = new NameValueCollection(URL.GetQueryString(new Uri(firsturl + nextUrl).Query));
            if (!resultsecondBody.IsValid()) { RemoveOldIp(proxyEntity); continue; }
            if (ValidText(resultsecondBody.Result)) { RemoveOldIp(proxyEntity); continue; }
            // Detail page processing: "<th" is rewritten to "<td" so the "tr td" selector below also matches header cells.
            var sorceIhtml = new JumonyParser().Parse(resultsecondBody.Result.Replace("<th", "<td"));
            var tableLists = sorceIhtml.Find("table[class='f-lbiao']").ToList();
            var listall = new List<string>();
            // Flatten every cell text into one list, stripping a trailing colon and surrounding whitespace.
            // NOTE(review): the two TrimEnd calls are identical as written, so the second is a no-op;
            // possibly one was meant to strip a full-width colon - confirm.
            foreach (var tableList in tableLists)
                tableList.Find("tr td")
                    .ForEach(t => listall.Add(t.InnerText().TrimEnd(':').TrimEnd(':').Trim()));
            var fillModel = FillModel(listall);
            // NOTE(review): if "reg_bus_ent_id" is absent from the query string the indexer presumably yields
            // null and ToUpper throws; the catch below would then log and retry - confirm intended.
            fillModel.全局唯一编号 = nameValueCollection["reg_bus_ent_id"].ToUpper();
            new CrawlerDomain().Add(StrategyNo1(fillModel, targetModel));
            // Follow-up bookkeeping: bump the proxy's usage counter and persist it
            proxyEntity.Usage = proxyEntity.Usage + 1;
            new ProxyDomain().Update(proxyEntity);
            Console.WriteLine("{0} 抓取到:{1}", Task.CurrentId, targetModel.搜索名称);
        }
        catch (Exception e)
        {
            // Any exception is written to the log store and the attempt is retried.
            new LogDomain().Add(new LogEntity { LogType = "error", TaskName = TaskEntity.TaskName, ErrorDetails = Task.CurrentId + "线程: " + e.Message, Details = e.ToString(), TriggerTime = DateTime.Now });
            continue;
        }
        // Reached only after a successful store: leave the retry loop.
        break;
    }
}