/// <summary> /// 获取正确的代理ip /// </summary> /// <param name="Param">爬取参数</param> /// <returns>正确的代理ip</returns> public static string GetCorrectIP(ProxyParam Param) { string ProxyIp = string.Empty; string tempProxyIp = string.Empty; //当前页 int currentPage = 1; int PageSize = 100; while (string.IsNullOrEmpty(ProxyIp)) { //从数据库取 string strSQL = string.Format(@"SELECT IP,Port FROM ( SELECT IP,Port,ROW_NUMBER() OVER(ORDER BY Speed) AS Num FROM dbo.p_IPProxy WHERE Type='HTTP' AND ProxyIp NOT IN(SELECT ProxyIP FROM dbo.p_ProxyIPUseHistory WHERE Type='IpProxyJob' AND CreateDay=CONVERT(VARCHAR(10),GETDATE(),120)) ) AS A WHERE Num BETWEEN {0} AND {1}", (currentPage - 1) * PageSize + 1, currentPage * PageSize); DataTable dt = SQLHelper.FillDataTable(strSQL); if (dt == null || dt.Rows.Count == 0) { break; } foreach (DataRow dr in dt.Rows) { tempProxyIp = GetIP(dr["IP"].ToString(), dr["Port"].ToString()); TaskLog.IpProxyLogInfo.WriteLogE("当前IP:" + tempProxyIp); if (Ping(dr["IP"].ToString()) && GetTotalPage(Param.IPUrl, tempProxyIp) > 1) { ProxyIp = tempProxyIp; break; } } currentPage++; } if (string.IsNullOrEmpty(ProxyIp)) { ProxyIp = Param.DefaultProxyIp; } return(ProxyIp); }
public static List<IPProxy> ParseProxy(ProxyParam Param) { if (string.IsNullOrEmpty(Param.IPUrl)) { throw new ArgumentNullException("ParseProxy函数参数空异常"); } //总页数 int total = GetTotalPage(Param.IPUrl, Param.ProxyIp); //返回结果 List<IPProxy> list = new List<IPProxy>(); //多线程进行解析获取 List<Thread> listThread = new List<Thread>(); //每个线程需要解析的页面数量 int threadPqgeSize = (total / CPUCount) + 1; int count = 0; //为每个线程准备参数 List<Hashtable> threadParams = new List<Hashtable>(); int start, end; Hashtable table = null; for (int i = 0; i < CPUCount; i++) { start = i * threadPqgeSize + 1; if (i == CPUCount - 1) { end = total; } else { end = start + threadPqgeSize; } table = new Hashtable(); table.Add("start", start); table.Add("end", end); table.Add("list", list); table.Add("param", Param); threadParams.Add(table); count += threadPqgeSize; } for (int i = 1; i < CPUCount; i++) { Thread thread = new Thread(DoWork); thread.IsBackground = true; thread.Name = "PageParse #" + i.ToString(); listThread.Add(thread); thread.Start(threadParams[i]); } // 为当前线程指派生成任务。 DoWork(threadParams[0]); // 等待所有的编译线程执行线束。 foreach (Thread thread in listThread) { thread.Join(); } if (list.Count == 0) { TaskLog.IpProxyLogInfo.WriteLogE("爬虫-代理ip任务", new Exception("没有获取到数据,可能当前ip(" + Param.ProxyIp + ")已被服务器封锁")); } else { BatchSaveData(list); } return list; }
/// <summary> /// 获取正确的代理ip /// </summary> /// <param name="Param">爬取参数</param> /// <returns>正确的代理ip</returns> public static string GetCorrectIP(ProxyParam Param) { string ProxyIp = string.Empty; string tempProxyIp = string.Empty; //当前页 int currentPage = 1; int PageSize = 100; while (string.IsNullOrEmpty(ProxyIp)) { //从数据库取 string strSQL = string.Format(@"SELECT IP,Port FROM ( SELECT IP,Port,ROW_NUMBER() OVER(ORDER BY Speed) AS Num FROM dbo.p_IPProxy WHERE Type='HTTP' AND ProxyIp NOT IN(SELECT ProxyIP FROM dbo.p_ProxyIPUseHistory WHERE Type='IpProxyJob' AND CreateDay=CONVERT(VARCHAR(10),GETDATE(),120)) ) AS A WHERE Num BETWEEN {0} AND {1}", (currentPage - 1) * PageSize + 1, currentPage * PageSize); DataTable dt = SQLHelper.FillDataTable(strSQL); if (dt==null || dt.Rows.Count == 0) { break; } foreach (DataRow dr in dt.Rows) { tempProxyIp = GetIP(dr["IP"].ToString(), dr["Port"].ToString()); TaskLog.IpProxyLogInfo.WriteLogE("当前IP:" + tempProxyIp); if (Ping(dr["IP"].ToString()) && GetTotalPage(Param.IPUrl, tempProxyIp) > 1) { ProxyIp = tempProxyIp; break; } } currentPage++; } if (string.IsNullOrEmpty(ProxyIp)) { ProxyIp = Param.DefaultProxyIp; } return ProxyIp; }
public static List <IPProxy> ParseProxy(ProxyParam Param) { if (string.IsNullOrEmpty(Param.IPUrl)) { throw new ArgumentNullException("ParseProxy函数参数空异常"); } //总页数 int total = GetTotalPage(Param.IPUrl, Param.ProxyIp); //返回结果 List <IPProxy> list = new List <IPProxy>(); //多线程进行解析获取 List <Thread> listThread = new List <Thread>(); //每个线程需要解析的页面数量 int threadPqgeSize = (total / CPUCount) + 1; int count = 0; //为每个线程准备参数 List <Hashtable> threadParams = new List <Hashtable>(); int start, end; Hashtable table = null; for (int i = 0; i < CPUCount; i++) { start = i * threadPqgeSize + 1; if (i == CPUCount - 1) { end = total; } else { end = start + threadPqgeSize; } table = new Hashtable(); table.Add("start", start); table.Add("end", end); table.Add("list", list); table.Add("param", Param); threadParams.Add(table); count += threadPqgeSize; } for (int i = 1; i < CPUCount; i++) { Thread thread = new Thread(DoWork); thread.IsBackground = true; thread.Name = "PageParse #" + i.ToString(); listThread.Add(thread); thread.Start(threadParams[i]); } // 为当前线程指派生成任务。 DoWork(threadParams[0]); // 等待所有的编译线程执行线束。 foreach (Thread thread in listThread) { thread.Join(); } if (list.Count == 0) { TaskLog.IpProxyLogInfo.WriteLogE("爬虫-代理ip任务", new Exception("没有获取到数据,可能当前ip(" + Param.ProxyIp + ")已被服务器封锁")); } else { BatchSaveData(list); } return(list); }
/// <summary> /// 解析每一页数据 /// </summary> /// <param name="param"></param> private static void DoWork(object param) { //参数还原 Hashtable table = param as Hashtable; int start = Convert.ToInt32(table["start"]); int end = Convert.ToInt32(table["end"]); List <IPProxy> list = table["list"] as List <IPProxy>; ProxyParam Param = table["param"] as ProxyParam; //页面地址 string url = string.Empty; string ip = string.Empty; IPProxy item = null; HtmlNodeCollection nodes = null; HtmlNode node = null; HtmlAttribute atr = null; for (int i = start; i <= end; i++) { TaskLog.IpProxyLogInfo.WriteLogE(string.Format("开始解析,页码{0}~{1},当前页码{2}", start, end, i)); url = string.Format("{0}/{1}", Param.IPUrl, i); var doc = new HtmlDocument(); doc.LoadHtml(GetHTML(url, Param.ProxyIp)); //获取所有数据节点tr var trs = doc.DocumentNode.SelectNodes(@"//table[@id='ip_list']/tr"); if (trs != null && trs.Count > 1) { TaskLog.IpProxyLogInfo.WriteLogE(string.Format("当前页码{0},请求地址{1},共{2}条数据", i, url, trs.Count)); for (int j = 1; j < trs.Count; j++) { nodes = trs[j].SelectNodes("td"); if (nodes != null && nodes.Count > 9) { ip = nodes[2].InnerText.Trim(); if (Param.IsPingIp && !Ping(ip)) { continue; } //有效的IP才添加 item = new IPProxy(); node = nodes[1].FirstChild; if (node != null) { atr = node.Attributes["alt"]; if (atr != null) { item.Country = atr.Value.Trim(); } } item.IP = ip; item.Port = nodes[3].InnerText.Trim(); item.ProxyIp = GetIP(item.IP, item.Port); item.Position = nodes[4].InnerText.Trim(); item.Anonymity = nodes[5].InnerText.Trim(); item.Type = nodes[6].InnerText.Trim(); node = nodes[7].SelectSingleNode("div[@class='bar']"); if (node != null) { atr = node.Attributes["title"]; if (atr != null) { item.Speed = atr.Value.Trim(); } } node = nodes[8].SelectSingleNode("div[@class='bar']"); if (node != null) { atr = node.Attributes["title"]; if (atr != null) { item.ConnectTime = atr.Value.Trim(); } } item.VerifyTime = nodes[9].InnerText.Trim(); list.Add(item); } } TaskLog.IpProxyLogInfo.WriteLogE(string.Format("当前页码{0},共{1}条数据", i, trs.Count)); } TaskLog.IpProxyLogInfo.WriteLogE(string.Format("结束解析,页码{0}~{1},当前页码{2}", start, end, i)); } }