/// <summary>
/// Reports the search progress of a task as JSON.
/// </summary>
/// <param name="model">Task whose <c>Unique</c> value is compared with each row's <c>TaskGuid</c>.</param>
/// <returns>
/// A JSON payload whose <c>state</c> is always "doing" and whose <c>num</c> is the value
/// <c>GetInt</c> returns for rows of this task that have <c>IsSearched</c> set.
/// </returns>
public ActionResult CheckGoGather(TaskEntity model)
{
    // NOTE(review): GetInt presumably returns the number of matching rows — confirm.
    return Json(new
    {
        state = "doing",
        num = new TargeCompanyDomain().GetInt(
            row => row.TaskGuid == model.Unique && row.IsSearched)
    });
}
/// <summary>
/// Looks up a single company by name and renders the stored crawler record for it,
/// starting a one-off crawl first when no target-company row matches the name.
/// </summary>
/// <param name="guid">Identifier echoed to the view and used as the new task's <c>Unique</c> value.</param>
/// <param name="searchInfo">Company name to look up.</param>
/// <returns>
/// The view, with the first crawler record matching <paramref name="searchInfo"/> as its model,
/// or a null model when nothing matches or when both arguments are blank.
/// </returns>
public ActionResult SingelSearch(string guid = null, string searchInfo = null)
{
    CrawlerEntity found = null;

    // Neither argument supplied: give the page a fresh identifier and render with no model.
    if (string.IsNullOrWhiteSpace(guid) && string.IsNullOrWhiteSpace(searchInfo))
    {
        ViewBag.Guid = Guid.NewGuid();
        return View(found);
    }

    ViewBag.Guid = guid;

    // The count is null when Get returns null; a null count compares as "not greater than 0".
    var knownCount = new TargeCompanyDomain().Get(t => t.CompanyName.Equals(searchInfo))?.Count;
    var alreadyInHistory = knownCount > 0;

    if (!alreadyInHistory)
    {
        // Not in the history records yet: search online.
        var singleTask = new TaskEntity
        {
            TaskType = EnumTaskType.BjCrawler,
            TaskName = $"单个任务[{DateTime.Now.ToString("G")}]",
            Unique = Conv.ToGuid(guid),
            TaskStateDicId = 1,
            TaskNum = 1,
            CreateTime = DateTime.Now,
            IsSingelSearch = true
        };
        new TaskDomain().Add(singleTask);

        new BaseData(singleTask).InsertMetadata(
            new List<string> { searchInfo },
            singleTask.TaskName,
            singleTask,
            taskEntity =>
            {
                // Start several crawlers for the same name and continue as soon as any one finishes;
                // the remaining tasks are not awaited here.
                const int workerCount = 4;
                var workers = new Task[workerCount];
                for (var slot = 0; slot < workerCount; slot++)
                {
                    var worker = new Task(() =>
                    {
                        var crawler = new Crawler.Bjqyxy.BjCrawler(taskEntity, t => t.TaskGuid.Equals(taskEntity.Unique));
                        crawler.SingelSearch(searchInfo);
                    });
                    workers[slot] = worker;
                    worker.Start();
                }

                Task.WaitAny(workers);
            });
    }

    found = new CrawlerDomain().Get(t => t.搜索名称 == searchInfo && t.称 != null).FirstOrDefault();
    return View(found);
}
/// <summary>
/// Reports the metadata-insertion progress of a task as JSON.
/// </summary>
/// <param name="model">Task whose <c>Unique</c> value is compared with each row's <c>TaskGuid</c>.</param>
/// <returns>
/// A JSON payload whose <c>state</c> is always "doing" and whose <c>num</c> is the value
/// <c>GetInt</c> returns for rows belonging to this task.
/// </returns>
public ActionResult CheckInsertMetadata(TaskEntity model)
{
    // NOTE(review): GetInt presumably returns the number of matching rows — confirm.
    return Json(new
    {
        state = "doing",
        num = new TargeCompanyDomain().GetInt(row => row.TaskGuid == model.Unique)
    });
}
/// <summary>
/// Crawl loop. Each pass takes a company via <c>GetSingel(QueryCondition)</c>, posts its name to
/// <c>targetUrl</c> through a proxy, follows the link found in the result list, parses the detail
/// page and stores the resulting record. The loop ends when <c>GetSingel</c> returns null.
/// </summary>
/// <remarks>
/// A rejected response (not valid, or flagged by <c>ValidText</c>) discards the proxy through
/// <c>RemoveOldIp</c> and retries with the same company name. An empty result list, a stored
/// record, or an exception causes a fresh <c>GetSingel</c> call on the next pass.
/// </remarks>
private void BaseWork()
{
    var fetchNextCompany = true;  // whether to fetch a new company name on the next pass
    var rememberedCompany = "";   // company name remembered for retries

    var client = new HttpClient();
    client.Setting.Timeout = 5 * 1000;
    // Initial POST to firsturl; the response is discarded.
    // NOTE(review): presumably this primes a session/cookie on the client — confirm.
    client.Create<string>(HttpMethod.Post, firsturl).Send();

    while (true)
    {
        var record = new CrawlerEntity
        {
            操作人姓名 = TaskEntity.OperatorName,
            入爬行库时间 = TaskEntity.CreateTime,
            TaskGuid = TaskEntity.Unique
        };

        try
        {
            // Prepare the search term.
            if (fetchNextCompany)
            {
                var company = new TargeCompanyDomain().GetSingel(QueryCondition);
                if (company == null)
                {
                    break;
                }

                record.搜索名称 = company.CompanyName; // search name, persisted as-is
                rememberedCompany = record.搜索名称;
                fetchNextCompany = false;
            }
            else
            {
                record.搜索名称 = rememberedCompany;
            }

            // Proxy handling: take a stored proxy, or fall back to one obtained online.
            var activeProxy = new ProxyDomain().GetByRandom();
            if (activeProxy == null)
            {
                Console.WriteLine("在线代理临时获取策略启动。");
                activeProxy = Proxy.Proxy.GetInstance().GetHttProxyEntity();
                Console.WriteLine("线上获取到了代理:{0}:{1}", activeProxy.IpAddress, activeProxy.Port);
            }

            client.Setting.Proxy = new WebProxy(activeProxy.IpAddress, activeProxy.Port);

            var searchResponse = client
                .Create<string>(HttpMethod.Post, targetUrl, data: new { queryStr = record.搜索名称, module = "", idFlag = "qyxy" })
                .Send();
            if (!searchResponse.IsValid() || ValidText(searchResponse.Result))
            {
                RemoveOldIp(activeProxy);
                continue;
            }

            // Extract the second-level link.
            var links = new JumonyParser().Parse(searchResponse.Result).Find("li a").ToList();
            if (links.Count == 0)
            {
                fetchNextCompany = true;
                AddNull(record);
                continue;
            }

            // Every link overwrites the previous one, so the values of the last link are kept.
            var detailPath = "";
            foreach (var link in links)
            {
                record.称 = link.InnerText();
                detailPath = url + link.Attribute("href").AttributeValue;
            }

            // Fetch the detail page.
            var detailQuery = new Uri(firsturl + detailPath).Query;
            var detailResponse = client.Create<string>(HttpMethod.Get, zhuUrl + detailQuery).Send();
            var queryValues = new NameValueCollection(URL.GetQueryString(detailQuery));
            if (!detailResponse.IsValid() || ValidText(detailResponse.Result))
            {
                RemoveOldIp(activeProxy);
                continue;
            }

            // Parse the detail page: header cells are rewritten to data cells so one selector covers both.
            var detailDocument = new JumonyParser().Parse(detailResponse.Result.Replace("<th", "<td"));
            var cellTexts = new List<string>();
            foreach (var table in detailDocument.Find("table[class='f-lbiao']").ToList())
            {
                // NOTE(review): both TrimEnd calls strip the same character as written here; if one was
                // meant to strip a full-width colon, verify against the file's original encoding.
                table.Find("tr td")
                     .ForEach(cell => cellTexts.Add(cell.InnerText().TrimEnd(':').TrimEnd(':').Trim()));
            }

            var parsed = FillModel(cellTexts);
            parsed.全局唯一编号 = queryValues["reg_bus_ent_id"].ToUpper();
            new CrawlerDomain().Add(StrategyNo1(parsed, record));

            // Follow-up bookkeeping: bump the proxy's usage counter and persist it.
            activeProxy.Usage += 1;
            new ProxyDomain().Update(activeProxy);
            Console.WriteLine("{0} 抓取到:{1}", Task.CurrentId, record.搜索名称);
        }
        catch (Exception e)
        {
            // Log the failure, then fall through to the reload flag below.
            new LogDomain().Add(new LogEntity
            {
                LogType = "error",
                TaskName = TaskEntity.TaskName,
                ErrorDetails = Task.CurrentId + "线程: " + e.Message,
                Details = e.ToString(),
                TriggerTime = DateTime.Now
            });
        }

        // Reached after a stored record or a logged exception; the `continue` paths above skip it.
        fetchNextCompany = true;
    }
}