/// <summary>
/// Merge strategy: completes a freshly scraped entity with the bookkeeping
/// fields of the search request and normalizes its capital fields.
/// </summary>
/// <param name="fillModel">Entity populated from the scraped page; it is modified in place.</param>
/// <param name="appendEntity">Entity carrying the search name, task guid, operator name and crawl-in time.</param>
/// <returns>The same <paramref name="fillModel"/> instance after it has been completed.</returns>
private CrawlerEntity StrategyNo1(CrawlerEntity fillModel, CrawlerEntity appendEntity)
{
    fillModel.搜索名称 = appendEntity.搜索名称;
    fillModel.TaskGuid = appendEntity.TaskGuid;
    fillModel.操作人姓名 = appendEntity.操作人姓名;

    // Keep the scraped name when there is one; otherwise fall back to the request's name.
    if (string.IsNullOrWhiteSpace(fillModel.称))
    {
        fillModel.称 = appendEntity.称;
    }

    fillModel.入爬行库时间 = appendEntity.入爬行库时间;
    fillModel.爬行更新时间 = DateTime.Now;

    if (string.IsNullOrWhiteSpace(fillModel.注册资本))
    {
        return fillModel;
    }

    // The registered-capital text is split into up to three space-separated parts:
    // amount, unit and currency. Empty entries are dropped so that leading or
    // repeated spaces do not shift the parts or produce empty values.
    var parts = fillModel.注册资本.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

    if (parts.Length > 0)
    {
        fillModel.注册资本 = parts[0].Trim();
        // The paid-in capital fields are reduced to their first token as well.
        fillModel.实收资本 = FirstSpaceSeparatedToken(fillModel.实收资本);
        fillModel.实缴出资金额 = FirstSpaceSeparatedToken(fillModel.实缴出资金额);
    }

    if (parts.Length > 1)
    {
        fillModel.资金单位 = parts[1].Trim();
    }

    if (parts.Length > 2)
    {
        fillModel.币种 = parts[2].Trim();
    }

    return fillModel;
}

/// <summary>
/// Returns the first non-empty space-separated token of <paramref name="value"/>,
/// <c>null</c> when the input is <c>null</c>, or an empty string when it has no token.
/// </summary>
private static string FirstSpaceSeparatedToken(string value)
{
    if (value == null)
    {
        return null;
    }

    var tokens = value.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
    return tokens.Length > 0 ? tokens[0] : string.Empty;
}
/// <summary>
/// Inserts the entity unless a row with the same search name is already stored.
/// </summary>
/// <param name="model">Entity to insert.</param>
public void Add(CrawlerEntity model)
{
    using (var db = new LiGatherContext())
    {
        bool alreadyStored = db.CrawlerEntities.Any(t => t.搜索名称.Equals(model.搜索名称));
        if (alreadyStored)
        {
            // Existing search name: nothing is written.
            return;
        }

        db.CrawlerEntities.Add(model);
        db.SaveChanges();
    }
}
/// <summary>
/// Single-company search action. With neither argument supplied it only issues a new
/// guid for the view. Otherwise it checks the target-company history for
/// <paramref name="searchInfo"/>, starts an online crawl when no history entry is
/// found, and renders the crawler record stored for that search name (if any).
/// </summary>
/// <param name="guid">Identifier of the search task; echoed back through the view bag.</param>
/// <param name="searchInfo">Company name to search for.</param>
/// <returns>The view, bound to the matching crawler entity or to <c>null</c>.</returns>
public ActionResult SingelSearch(string guid = null, string searchInfo = null)
{
    CrawlerEntity result = null;
    bool nothingRequested = string.IsNullOrWhiteSpace(guid) && string.IsNullOrWhiteSpace(searchInfo);

    if (nothingRequested)
    {
        ViewBag.Guid = Guid.NewGuid();
    }
    else
    {
        ViewBag.Guid = guid;

        var historyCount = new TargeCompanyDomain().Get(t => t.CompanyName.Equals(searchInfo))?.Count;
        // A null count compares as "not greater than zero", so it also triggers the online search.
        bool foundInHistory = historyCount > 0;

        if (!foundInHistory)
        {
            // Not in the history records: search online.
            var companies = new List<string> { searchInfo };
            var model = new TaskEntity
            {
                TaskType = EnumTaskType.BjCrawler,
                TaskName = $"单个任务[{DateTime.Now.ToString("G")}]",
                Unique = Conv.ToGuid(guid),
                TaskStateDicId = 1,
                TaskNum = 1,
                CreateTime = DateTime.Now,
                IsSingelSearch = true
            };

            new TaskDomain().Add(model);
            new BaseData(model).InsertMetadata(companies.ToList(), model.TaskName, model, taskEntity =>
            {
                // Four identical crawler tasks are started; the callback returns as soon as any one finishes.
                const int workerCount = 4;
                var workers = new Task[workerCount];
                for (var slot = 0; slot < workers.Length; slot++)
                {
                    var worker = new Task(() =>
                    {
                        new Crawler.Bjqyxy.BjCrawler(taskEntity, t => t.TaskGuid.Equals(taskEntity.Unique))
                            .SingelSearch(searchInfo);
                    });
                    workers[slot] = worker;
                    worker.Start();
                }

                Task.WaitAny(workers);
            });
        }

        result = new CrawlerDomain().Get(t => t.搜索名称 == searchInfo && t.称 != null).FirstOrDefault();
    }

    return View(result);
}
/// <summary>
/// Builds a crawler entity from a flat list of scraped cell texts in which a label
/// is immediately followed by its value: for every property whose name occurs in
/// the list, the following element is assigned to that property.
/// </summary>
/// <param name="scoreList">Flat list of cell texts (label, value, label, value, ...).</param>
/// <returns>A new entity with every matched, string-compatible property populated.</returns>
private CrawlerEntity FillModel(List<string> scoreList)
{
    var model = new CrawlerEntity();

    foreach (var property in model.GetType().GetProperties())
    {
        var propertyName = property.Name;
        var labelIndex = scoreList.FindIndex(cell => string.Equals(cell, propertyName));
        if (labelIndex < 0)
        {
            continue;
        }

        // A label sitting in the last position has no value after it.
        var valueIndex = labelIndex + 1;
        if (valueIndex >= scoreList.Count)
        {
            continue;
        }

        // Only string values are available here; skip properties that cannot accept
        // a string (for example date-typed ones) or that have no setter, instead of
        // letting the reflection call throw.
        if (!property.CanWrite || !property.PropertyType.IsAssignableFrom(typeof(string)))
        {
            continue;
        }

        property.SetValue(model, scoreList[valueIndex], null);
    }

    return model;
}
/// <summary>
/// Stores the bare search entity when nothing was retrieved for it, and reports
/// the empty insert on the console together with the current task id.
/// </summary>
/// <param name="targetModel">Entity carrying only the search bookkeeping fields.</param>
private void AddNull(CrawlerEntity targetModel)
{
    var crawlerDomain = new CrawlerDomain();
    crawlerDomain.Add(targetModel);

    Console.WriteLine("{0} 空对象:{1}", Task.CurrentId, targetModel.搜索名称);
}
/// <summary>
/// Searches a single company through a proxy: posts the search query, follows the
/// result link to the detail page, converts the detail tables into a crawler
/// entity and stores it. The whole attempt is repeated whenever a response is
/// rejected or an exception is thrown.
/// </summary>
/// <param name="companyName">Company name to search for.</param>
public void SingelSearch(string companyName)
{
    var client = new HttpClient();
    client.Setting.Timeout = 1000 * 5;
    // Initial request against the entry URL; its response is not used.
    client.Create<string>(HttpMethod.Post, firsturl).Send();

    // NOTE(review): there is no attempt limit — a failure that repeats on every
    // attempt keeps this loop running. Confirm that this is intended.
    for (;;)
    {
        var targetModel = new CrawlerEntity
        {
            搜索名称 = companyName,
            操作人姓名 = TaskEntity.OperatorName,
            入爬行库时间 = TaskEntity.CreateTime,
            TaskGuid = TaskEntity.Unique
        };

        try
        {
            // Proxy selection: a stored proxy first, an online-fetched one as fallback.
            var proxyInfo = new ProxyDomain().GetByRandom();
            if (proxyInfo == null)
            {
                Console.WriteLine("在线代理临时获取策略启动。");
                proxyInfo = Proxy.Proxy.GetInstance().GetHttProxyEntity();
                Console.WriteLine("线上获取到了代理:{0}:{1}", proxyInfo.IpAddress, proxyInfo.Port);
            }

            client.Setting.Proxy = new WebProxy(proxyInfo.IpAddress, proxyInfo.Port);

            var searchResponse = client.Create<string>(
                HttpMethod.Post,
                targetUrl,
                data: new { queryStr = targetModel.搜索名称, module = "", idFlag = "qyxy" }).Send();

            // An invalid response, or a body flagged by ValidText, discards the proxy and retries.
            // NOTE(review): ValidText presumably detects a block/verification page — confirm.
            if (!searchResponse.IsValid() || ValidText(searchResponse.Result))
            {
                RemoveOldIp(proxyInfo);
                continue;
            }

            // Extract the second-level links from the result list.
            var links = new JumonyParser().Parse(searchResponse.Result).Find("li a").ToList();
            if (links.Count < 1)
            {
                // Nothing found: store the bare search entity and stop.
                AddNull(targetModel);
                return;
            }

            // Every link is visited; the values of the last one are the ones that remain.
            var detailPath = "";
            foreach (var link in links)
            {
                targetModel.称 = link.InnerText();
                detailPath = url + link.Attribute("href").AttributeValue;
            }

            // Fetch the detail page using the query string of the selected link.
            var detailQuery = new Uri(firsturl + detailPath).Query;
            var detailResponse = client.Create<string>(HttpMethod.Get, zhuUrl + detailQuery).Send();
            var queryValues = new NameValueCollection(URL.GetQueryString(detailQuery));

            if (!detailResponse.IsValid() || ValidText(detailResponse.Result))
            {
                RemoveOldIp(proxyInfo);
                continue;
            }

            // Header cells are rewritten to data cells so labels and values are read uniformly.
            var document = new JumonyParser().Parse(detailResponse.Result.Replace("<th", "<td"));
            var tables = document.Find("table[class='f-lbiao']").ToList();
            var cellTexts = new List<string>();
            foreach (var table in tables)
            {
                table.Find("tr td")
                    .ForEach(cell => cellTexts.Add(cell.InnerText().TrimEnd(':').TrimEnd(':').Trim()));
            }

            var fillModel = FillModel(cellTexts);
            fillModel.全局唯一编号 = queryValues["reg_bus_ent_id"].ToUpper();
            new CrawlerDomain().Add(StrategyNo1(fillModel, targetModel));

            // Follow-up bookkeeping: count the successful use of this proxy.
            proxyInfo.Usage = proxyInfo.Usage + 1;
            new ProxyDomain().Update(proxyInfo);
            Console.WriteLine("{0} 抓取到:{1}", Task.CurrentId, targetModel.搜索名称);

            return;
        }
        catch (Exception e)
        {
            // Record the failure, then fall through to the next attempt.
            new LogDomain().Add(new LogEntity
            {
                LogType = "error",
                TaskName = TaskEntity.TaskName,
                ErrorDetails = Task.CurrentId + "线程: " + e.Message,
                Details = e.ToString(),
                TriggerTime = DateTime.Now
            });
        }
    }
}