/// <summary>
/// Searches for a single company and stores the crawled record.
/// Each attempt picks a proxy, posts the search form, follows the last result link
/// to the detail page, parses the detail tables and saves the filled entity.
/// The attempt is repeated until a record has been saved or the search returned
/// no links (in which case the entity is handed to <c>AddNull</c>).
/// </summary>
/// <param name="companyName">Company name sent as the search keyword.</param>
public void SingelSearch(string companyName)
{
    var httpClient = new HttpClient();
    httpClient.Setting.Timeout = 1000 * 5; // presumably milliseconds (5 s) - confirm against the HttpClient wrapper

    // Initial request to the entry page; its response is not used.
    // NOTE(review): presumably primes cookies/session state for later requests - confirm.
    httpClient.Create<string>(HttpMethod.Post, firsturl).Send();

    // NOTE(review): the loop has no retry limit; a failure that repeats on every
    // attempt keeps it running (and logging) indefinitely.
    while (true)
    {
        // A fresh entity per attempt so a failed attempt leaves no partial data behind.
        var targetModel = new CrawlerEntity
        {
            搜索名称 = companyName,
            操作人姓名 = TaskEntity.OperatorName,
            入爬行库时间 = TaskEntity.CreateTime,
            TaskGuid = TaskEntity.Unique
        };

        try
        {
            // Proxy selection: local pool first, online source as fallback.
            var proxyEntity = new ProxyDomain().GetByRandom();
            if (proxyEntity == null)
            {
                Console.WriteLine("在线代理临时获取策略启动。");
                proxyEntity = Proxy.Proxy.GetInstance().GetHttProxyEntity();
                Console.WriteLine("线上获取到了代理:{0}:{1}", proxyEntity.IpAddress, proxyEntity.Port);
            }
            httpClient.Setting.Proxy = new WebProxy(proxyEntity.IpAddress, proxyEntity.Port);

            // Step 1: search request.
            var resultBody = httpClient.Create<string>(HttpMethod.Post, targetUrl,
                data: new { queryStr = targetModel.搜索名称, module = "", idFlag = "qyxy" }).Send();

            // An invalid response, or one flagged by ValidText, discards the proxy and retries.
            if (!resultBody.IsValid() || ValidText(resultBody.Result))
            {
                RemoveOldIp(proxyEntity);
                continue;
            }

            // Step 2: extract the second-level link from the result list.
            var links = new JumonyParser().Parse(resultBody.Result).Find("li a").ToList();
            if (links.Count < 1)
            {
                AddNull(targetModel);
                break;
            }

            // Only the last link ever took effect (earlier ones were overwritten), so read
            // just that one; this also avoids failing on an earlier anchor without href.
            var lastLink = links[links.Count - 1];
            targetModel.名称 = lastLink.InnerText();
            var nextUrl = url + lastLink.Attribute("href").AttributeValue;

            // Step 3: fetch the detail page, reusing the query string of the link.
            var detailUri = new Uri(firsturl + nextUrl);
            var resultsecondBody = httpClient.Create<string>(HttpMethod.Get, zhuUrl + detailUri.Query).Send();
            var nameValueCollection = new NameValueCollection(URL.GetQueryString(detailUri.Query));

            if (!resultsecondBody.IsValid() || ValidText(resultsecondBody.Result))
            {
                RemoveOldIp(proxyEntity);
                continue;
            }

            // Step 4: parse the detail tables. Header cells are rewritten to data cells so
            // that a single "tr td" selector returns labels and values alike.
            var sorceIhtml = new JumonyParser().Parse(resultsecondBody.Result.Replace("<th", "<td"));
            var tableLists = sorceIhtml.Find("table[class='f-lbiao']").ToList();
            var listall = new List<string>();
            foreach (var tableList in tableLists)
            {
                // NOTE(review): both TrimEnd calls strip the same ASCII colon as written;
                // one may have been intended for the full-width colon - confirm.
                tableList.Find("tr td")
                    .ForEach(t => listall.Add(t.InnerText().TrimEnd(':').TrimEnd(':').Trim()));
            }

            var fillModel = FillModel(listall);

            // Fail with a descriptive message instead of a NullReferenceException when the
            // link carries no id; the catch block below logs it and the attempt is retried.
            var entityId = nameValueCollection["reg_bus_ent_id"];
            if (entityId == null)
            {
                throw new InvalidOperationException(
                    "Query parameter 'reg_bus_ent_id' is missing from detail link: " + detailUri);
            }

            // Identifier casing must not depend on the current culture.
            fillModel.全局唯一编号 = entityId.ToUpperInvariant();
            new CrawlerDomain().Add(StrategyNo1(fillModel, targetModel));

            // Step 5: bookkeeping - record the successful use of the proxy.
            proxyEntity.Usage = proxyEntity.Usage + 1;
            new ProxyDomain().Update(proxyEntity);
            Console.WriteLine("{0} 抓取到:{1}", Task.CurrentId, targetModel.搜索名称);
        }
        catch (Exception e)
        {
            // Any failure is logged and the whole attempt is repeated.
            new LogDomain().Add(new LogEntity
            {
                LogType = "error",
                TaskName = TaskEntity.TaskName,
                ErrorDetails = Task.CurrentId + "线程: " + e.Message,
                Details = e.ToString(),
                TriggerTime = DateTime.Now
            });
            continue;
        }

        break;
    }
}
/// <summary>
/// Checks stored proxies on several parallel tasks and blocks until all of them finish.
/// Each task draws random proxy ids and passes the matching entity to
/// <c>ThreadValidate.Doit</c>; it stops at the first call that returns true or after
/// <c>MaxValue</c> draws.
/// </summary>
/// <param name="maxThreadNum">Number of tasks to start.</param>
public void ValidateCanUse(int maxThreadNum = 1)
{
    var tasks = new Task[maxThreadNum];
    for (var i = 0; i < maxThreadNum; i++)
    {
        tasks[i] = new Task(() =>
        {
            // One generator per task with a GUID-derived seed. Creating a default-seeded
            // Random on every iteration can yield identical sequences on runtimes that
            // seed from the clock, so parallel tasks would keep drawing the same ids.
            var random = new Random(Guid.NewGuid().GetHashCode());

            for (var j = 0; j < MaxValue; j++)
            {
                var proxyDomain = new ProxyDomain();

                var maxId = proxyDomain.GetMaxId();
                if (maxId < 1)
                {
                    // No usable id range; previously this made Random.Next throw.
                    break;
                }

                // Next(maxId) is in [0, maxId), so +1 covers 1..maxId inclusive
                // (the former Next(1, maxId) could never pick the highest id).
                var id = random.Next(maxId) + 1;

                var proxyEntity = proxyDomain.GetById(id);
                if (proxyEntity == null || proxyEntity.Id == 0)
                {
                    // No row for this id - draw again.
                    continue;
                }

                if (ThreadValidate.Doit(proxyEntity))
                {
                    break;
                }
            }

            Console.WriteLine("完成");
        });
        tasks[i].Start();
    }

    Task.WaitAll(tasks);
}
/// <summary>
/// Collects proxies from the online source on a foreground thread and stores the
/// ones that are not yet in the database. Returns immediately; the work continues
/// on the started thread until the requested number of proxies has been processed.
/// </summary>
/// <param name="countNum">Total number of proxies to collect.</param>
/// <param name="getNum">Number of proxies requested per call to the online source.</param>
/// <param name="isValidate">
/// When true, a proxy is stored and counted only if
/// <c>ThreadValidate.VerificationIp</c> accepts it.
/// </param>
public void ProxySave(int countNum = 100, int getNum = 5, bool isValidate = false)
{
    new Thread(() =>
    {
        // "> 0" instead of "== 0": a batch could push the counter below zero, after
        // which the equality test never matched and the loop ran forever.
        // NOTE(review): if the source keeps returning nothing usable this still loops
        // without delay - consider a back-off.
        while (countNum > 0)
        {
            var ipLists = GetProxyByHttp(getNum)
                .Split(Environment.NewLine.ToCharArray())
                .ExceptNull()
                .ToList();

            foreach (var ipList in ipLists)
            {
                if (countNum <= 0)
                {
                    // Quota reached in the middle of a batch.
                    break;
                }

                if (string.IsNullOrWhiteSpace(ipList))
                {
                    continue;
                }

                var ipAndPort = ipList.Split(':');
                if (ipAndPort.Length < 2)
                {
                    continue;
                }

                Console.WriteLine("获取到代理:" + ipList);

                var model = new ProxyEntity
                {
                    IpAddress = ipAndPort[0],
                    Port = Conv.ToInt(ipAndPort[1]),
                    Usage = 0,
                    CreateTime = DateTime.Now
                };

                // With validation on, a rejected proxy is skipped entirely. Previously it
                // fell through to the unconditional add below, and an accepted proxy was
                // counted twice.
                if (isValidate && !ThreadValidate.VerificationIp(model.IpAddress, model.Port))
                {
                    continue;
                }

                var proxyDomain = new ProxyDomain();
                if (!proxyDomain.IsExist(model))
                {
                    proxyDomain.Add(model);
                }

                // Counted whether or not the proxy was already stored, as before.
                countNum--;
            }
        }

        Console.WriteLine("IP采集完毕");
    })
    { IsBackground = false }.Start();
}
/// <summary>
/// Fetches one unverified proxy straight from the online source. Intended as a
/// temporary fallback when the local proxy pool has nothing to offer.
/// The proxy is stored in the database if it is not already present.
/// </summary>
/// <returns>The proxy entity built from the first "ip:port" line of the response.</returns>
/// <exception cref="InvalidOperationException">
/// The online source returned no line in "ip:port" form.
/// </exception>
public ProxyEntity GetHttProxyEntity()
{
    var response = GetProxyByHttp();

    // The response may carry line breaks or several entries (ProxySave splits the same
    // source on newlines), so pick the first well-formed line instead of splitting the
    // raw text on ':' and indexing blindly.
    string[] ipAndPort = null;
    if (!string.IsNullOrWhiteSpace(response))
    {
        var lines = response.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);
        foreach (var line in lines)
        {
            var parts = line.Trim().Split(':');
            if (parts.Length >= 2)
            {
                ipAndPort = parts;
                break;
            }
        }
    }

    if (ipAndPort == null)
    {
        throw new InvalidOperationException(
            "The online proxy source returned no entry in \"ip:port\" form.");
    }

    var now = DateTime.Now;
    var model = new ProxyEntity
    {
        IpAddress = ipAndPort[0],
        Port = Conv.ToInt(ipAndPort[1]),
        Usage = 1,       // handed out for immediate use
        CanUse = true,   // unverified, but marked usable
        CreateTime = now,
        LastUseTime = now
    };

    var proxyDomain = new ProxyDomain();
    if (!proxyDomain.IsExist(model))
    {
        proxyDomain.Add(model);
    }

    return model;
}