/// <summary>
/// Quartz job entry point. Runs one crawl of the proxy-listing site, switching the outbound
/// proxy every <c>Speed</c> executions or as soon as a crawl comes back empty.
/// </summary>
/// <param name="context">
/// Quartz execution context. The job data map entry "TaskParam" is read, converted to a string
/// and deserialized from JSON into a <c>ProxyParam</c>; when the entry is missing the run is a no-op.
/// </param>
/// <exception cref="JobExecutionException">
/// Thrown when the crawl fails. <c>RefireImmediately</c> is set while the context's refire count
/// is below a small cap, so the scheduler retries at once a limited number of times.
/// </exception>
public void Execute(IJobExecutionContext context)
{
    // Safety cap on back-to-back retries so a persistent failure (bad config, database down)
    // cannot turn into a tight refire loop. NOTE(review): the value is a judgment call.
    const int maxImmediateRefires = 3;

    // A previous firing is still running: skip this one.
    if (isRun)
    {
        return;
    }

    isRun = true;
    try
    {
        object objParam = context.JobDetail.JobDataMap.Get("TaskParam");
        if (objParam == null)
        {
            return;
        }

        ProxyParam taskParam = JsonConvert.DeserializeObject<ProxyParam>(objParam.ToString());
        DateTime start = DateTime.Now;
        LogHelper.WriteLog("\r\n\r\n\r\n\r\n------------------爬虫开始执行获取代理ip任务 " + start.ToString("yyyy-MM-dd HH:mm:ss") + " BEGIN-----------------------------\r\n\r\n");

        // Pick a new proxy every Speed executions, or immediately when the last crawl returned nothing.
        if (NeedChangeIP || ExecuteCount % Speed == 0)
        {
            if (NeedChangeIP)
            {
                // Round the counter up to the next multiple of Speed so the periodic
                // rotation is counted from this forced change.
                ExecuteCount = (ExecuteCount / Speed + 1) * Speed;
            }

            LogHelper.WriteLog("\r\n\r\n\r\n\r\n------------------开始解析使用的代理ip " + DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss") + " BEGIN-----------------------------\r\n\r\n");
            ProxyIp = IpProxyGet.GetCorrectIP(taskParam);
            LogHelper.WriteLog("------------------保存使用的代理ip:" + ProxyIp + " -----------------------------");
            SQLHelper.ExecuteNonQuery("INSERT INTO dbo.p_ProxyIPUseHistory(ProxyIP,Type) VALUES (@ProxyIP,'IpProxyJob')", new { ProxyIP = ProxyIp });
            NeedChangeIP = false;
        }

        taskParam.ProxyIp = ProxyIp;
        LogHelper.WriteLog("\r\n\r\n\r\n\r\n------------------任务使用的代理ip:" + taskParam.ProxyIp + "----------------------------\r\n\r\n");

        List<IPProxy> list = IpProxyGet.ParseProxy(taskParam);
        if (list.Count == 0)
        {
            // No data came back: treat the current proxy as blocked and replace it on the next run.
            NeedChangeIP = true;
        }

        DateTime end = DateTime.Now;
        ExecuteCount++;
        LogHelper.WriteLog("\r\n\r\n------------------爬虫完成获取代理ip任务:" + end.ToString("yyyy-MM-dd HH:mm:ss") + ",本次共耗时(分):" + (end - start).TotalMinutes + " END------------------------\r\n\r\n\r\n\r\n");
    }
    catch (Exception ex)
    {
        LogHelper.WriteLog("爬虫获取代理ip任务异常", ex);
        ExecuteCount++;

        // The exception must actually be thrown for the scheduler to honor RefireImmediately;
        // merely constructing it (as the code did before) has no effect.
        JobExecutionException jobException = new JobExecutionException(ex);
        jobException.RefireImmediately = context.RefireCount < maxImmediateRefires;
        throw jobException;
    }
    finally
    {
        // Always release the re-entrancy flag, including on the early return and on failure.
        isRun = false;
    }
}
/// <summary>
/// Picks a usable proxy address from the database for the proxy-crawl job.
/// Candidates are read page by page (100 per page, ordered by <c>Speed</c>), skipping proxies that are
/// flagged deleted or already recorded in the use history for this job type. The first candidate that
/// passes both the ping check and the page-count probe is recorded in the history and returned.
/// Candidates that fail the checks are flagged as deleted.
/// </summary>
/// <param name="param">Supplies <c>IpUrl</c> (the page used to probe a candidate) and <c>DefaultProxyIp</c> (the fallback).</param>
/// <returns>The chosen proxy as "ip:port", or <c>param.DefaultProxyIp</c> when no candidate qualifies.</returns>
public string GetCorrectIp(ProxyParam param)
{
    IProxyUseHistoryService proxyUseHistoryService = new ProxyUseHistoryService();
    string proxyIp = string.Empty;
    int currentPage = 1;
    int pageSize = 100;
    var proxyType = ProxyType.Http.GetDescription();
    var proxyJobType = ProxyUseForType.ProxyJob.GetDescription();
    // Candidates that failed verification; flagged in one batch after the search.
    var updateProxies = new List<Proxy>();

    while (string.IsNullOrEmpty(proxyIp))
    {
        int total;
        // NOTE(review): the first two arguments look like an (end row, start row) window — confirm against LoadPageEntities.
        var proxyUnUseList = LoadPageEntities(currentPage * pageSize, (currentPage - 1) * pageSize + 1, out total, r => !r.IsDelete && r.Type.Equals(proxyType), o => o.Speed, true);
        var proxyUsedList = proxyUseHistoryService.LoadEntities(r => r.Type.Equals(proxyJobType)).Select(p => p.ProxyGuid);
        var proxyList = proxyUnUseList.Where(r => !proxyUsedList.Contains(r.Guid)).ToList();

        if (proxyList.Count == 0)
        {
            // Every row on this page has been used before (or the page is empty). Previously the
            // search stopped here, so later pages were never examined. Keep paging until the
            // window has moved past the last row.
            // Assumes `total` is the overall matching row count — TODO confirm. Whatever it holds,
            // the loop terminates because currentPage grows on every pass.
            if (currentPage * pageSize >= total)
            {
                break;
            }

            currentPage++;
            continue;
        }

        foreach (var item in proxyList)
        {
            // Accept the proxy only if it answers a ping and the probe page reports more than one page through it.
            if (WebUtils.PingProxy(item.IP, item.Port) && ProxyUtil.GetTotalPage(param.IpUrl, $"{item.IP}:{item.Port}") > 1)
            {
                proxyIp = $"{item.IP}:{item.Port}";
                proxyUseHistoryService.Add(new ProxyUseHistory()
                {
                    Guid = Guid.NewGuid(),
                    ProxyGuid = item.Guid,
                    CreatedOn = DateTime.Now,
                    Type = proxyJobType
                });
                break;
            }
            else
            {
                item.IsDelete = true;
                updateProxies.Add(item);
            }
        }

        currentPage++;
    }

    // Persist the deleted flag for the proxies that failed verification.
    if (updateProxies.Count > 0)
    {
        Update(updateProxies, r => r.IsDelete);
    }

    if (string.IsNullOrEmpty(proxyIp))
    {
        proxyIp = param.DefaultProxyIp;
    }

    return proxyIp;
}
/// <summary>
/// Crawls every listing page under <c>param.IpUrl</c> and returns the proxies parsed from them.
/// The page range is split across background threads (at most <c>CpuCount</c>), each running
/// <c>DoWork</c>; the method blocks until all of them have finished.
/// </summary>
/// <param name="param">Crawl settings; <c>IpUrl</c> is required, <c>ProxyIp</c> is the outbound proxy passed to the page requests.</param>
/// <returns>The proxies collected by all workers; empty when nothing was parsed.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="param"/> is null, <c>param.IpUrl</c> is null or empty, or the site reports exactly one page.
/// </exception>
public static List<DBModels.Base.Proxy> ParseProxy(ProxyParam param)
{
    if (param == null)
    {
        throw new ArgumentNullException(nameof(param));
    }

    if (string.IsNullOrEmpty(param.IpUrl))
    {
        throw new ArgumentNullException("ParseProxy函数参数空异常");
    }

    // Total number of listing pages.
    int total = GetTotalPage(param.IpUrl, param.ProxyIp);
    if (total == 1)
    {
        throw new ArgumentNullException("总页数信息异常");
    }

    // Shared result list that every worker appends to.
    List<DBModels.Base.Proxy> list = new List<DBModels.Base.Proxy>();
    List<Thread> listThread = new List<Thread>();

    // With no more pages than CPUs a single worker takes the whole range. The old loop handled
    // this case by overwriting its own counter and then indexing the parameter list with it,
    // which always threw ArgumentOutOfRangeException.
    int workerCount = (CpuCount > 1 && total > CpuCount) ? CpuCount : 1;
    int pagesPerWorker = total / workerCount;

    for (int i = 0; i < workerCount; i++)
    {
        int start = i * pagesPerWorker + 1;
        // The last worker also takes the remainder pages.
        int end = (i == workerCount - 1) ? total : start + pagesPerWorker - 1;

        Hashtable table = new Hashtable();
        table.Add("start", start);
        table.Add("end", end);
        table.Add("list", list);
        table.Add("param", param);

        Thread thread = new Thread(DoWork)
        {
            IsBackground = true,
            Name = "PageParse #" + i.ToString()
        };
        LogHelper.WriteInfoLog($"线程{thread.Name}已开启,Start:{start},End:{end}");
        listThread.Add(thread);
        thread.Start(table);
    }

    // Wait for every worker to finish before reading the shared list.
    foreach (Thread thread in listThread)
    {
        thread.Join();
    }

    if (list.Count == 0)
    {
        LogHelper.WriteInfoLog("爬虫-代理ip任务,没有获取到数据,可能当前ip(" + param.ProxyIp + ")已被服务器封锁");
    }

    return list;
}
/// <summary>
/// Thread entry point: downloads and parses the listing pages in an inclusive page range and
/// appends each accepted proxy to the shared result list.
/// </summary>
/// <param name="param">
/// A <see cref="Hashtable"/> with the keys "start" and "end" (page range), "list" (the shared
/// <c>List&lt;DBModels.Base.Proxy&gt;</c>) and "param" (the <c>ProxyParam</c> crawl settings).
/// </param>
/// <exception cref="ArgumentNullException">
/// <paramref name="param"/> is not a Hashtable, or its "list" / "param" entries are missing or of the wrong type.
/// </exception>
private static void DoWork(object param)
{
    // Unpack the worker arguments.
    Hashtable table = param as Hashtable;
    if (table == null)
    {
        throw new ArgumentNullException(nameof(table));
    }

    int start = Convert.ToInt32(table["start"]);
    int end = Convert.ToInt32(table["end"]);

    List<DBModels.Base.Proxy> list = table["list"] as List<DBModels.Base.Proxy>;
    if (list == null)
    {
        throw new ArgumentNullException(nameof(list));
    }

    ProxyParam proxyParam = table["param"] as ProxyParam;
    if (proxyParam == null)
    {
        throw new ArgumentNullException(nameof(proxyParam));
    }

    for (int i = start; i <= end; i++)
    {
        LogHelper.WriteAsyncLog($"开始解析,页码{start}~{end},当前页码{i}");
        string url = $"{proxyParam.IpUrl}/{i}";

        var doc = new HtmlDocument();
        // NOTE(review): a dead outbound proxy will cause trouble here (flagged by the original author).
        doc.LoadHtml(GetHtml(url, proxyParam.ProxyIp));

        // Number of rows on this page that were accepted.
        int count = 0;
        var trs = doc.DocumentNode.SelectNodes(@"//table[@id='ip_list']/tr");
        if (trs != null && trs.Count > 1)
        {
            LogHelper.WriteAsyncLog($"当前页码{i},请求地址{url},共{trs.Count}条数据");

            // Row 0 is skipped — presumably the table header.
            for (int j = 1; j < trs.Count; j++)
            {
                HtmlNodeCollection nodes = trs[j].SelectNodes("td");
                if (nodes == null || nodes.Count <= 9)
                {
                    continue;
                }

                string ip = nodes[2].InnerText.Trim();
                var port = nodes[3].InnerText.Trim();

                // When ping verification is enabled, drop proxies that do not respond.
                if (proxyParam.IsPingIp && !WebUtils.PingProxy(ip, port))
                {
                    continue;
                }

                count++;
                LogHelper.WriteAsyncLog($"验证IP成功:{ip}:{port}");

                // Only accepted proxies are turned into result items.
                DBModels.Base.Proxy item = new DBModels.Base.Proxy();

                HtmlNode node = nodes[1].FirstChild;
                if (node != null)
                {
                    HtmlAttribute atr = node.Attributes["alt"];
                    if (atr != null)
                    {
                        item.Country = atr.Value.Trim();
                    }
                }

                item.IP = ip;
                item.Port = port;
                item.ProxyIp = $"{item.IP}:{item.Port}";
                item.Position = nodes[4].InnerText.Trim();
                item.Anonymity = nodes[5].InnerText.Trim();
                item.Type = nodes[6].InnerText.Trim();
                item.Guid = Guid.NewGuid();
                item.CreatedOn = DateTime.Now;

                node = nodes[7].SelectSingleNode("div[@class='bar']");
                if (node != null)
                {
                    HtmlAttribute atr = node.Attributes["title"];
                    if (atr != null)
                    {
                        item.Speed = atr.Value.Trim();
                    }
                }

                node = nodes[8].SelectSingleNode("div[@class='bar']");
                if (node != null)
                {
                    HtmlAttribute atr = node.Attributes["title"];
                    if (atr != null)
                    {
                        item.ConnectTime = atr.Value.Trim();
                    }
                }

                item.VerifyTime = nodes[9].InnerText.Trim();

                // The list is shared by all worker threads and List<T> is not safe for
                // concurrent writes, so serialize the append.
                lock (list)
                {
                    list.Add(item);
                }
            }

            LogHelper.WriteAsyncLog($"当前页码{i},共{trs.Count}条数据,验证成功{count}条。");
        }

        LogHelper.WriteAsyncLog($"结束解析,页码{start}~{end},当前页码{i}");
    }
}