public void Run()
{
    // Load the Taobao proxy order ids, one per line, from the config file.
    List<string> orderList = new List<string>();
    string configPath = AppDomain.CurrentDomain.BaseDirectory + "IP代理/Config/TaobaoProxyIpConfig.txt";
    using (StreamReader sr = new StreamReader(configPath))
    {
        string str;
        while ((str = sr.ReadLine()) != null)
        {
            orderList.Add(str);
        }
    }
    string url = "http://121.199.38.28/ip/?tid={0}&num=1&ports=80,808,3128&filter=on";
    // Marker the API returns when an order has no remaining quota; matched verbatim.
    string error = "ERROR|订单剩余数量不足";
    SysData_ProxyIp existsObj = new SysData_ProxyIp();
    string message = "";
    foreach (string orderStr in orderList)
    {
        string requestUrl = string.Format(url, orderStr);
        while (true)
        {
            string ipHtml = "";
        begin:
            try
            {
                ipHtml = SpiderHelp.GetHtml(requestUrl, "utf-8");
            }
            catch (Exception ex)
            {
                // Request failed: wait 3 seconds, then retry.
                log.Debug("Request failed, retrying", ex);
                System.Threading.Thread.Sleep(3000);
                goto begin;
            }
            // Order quota exhausted: move on to the next order.
            if (ipHtml.Contains(error))
            {
                break;
            }
            int result = ProxyIpHelp.ImportProxyIp(ipHtml, "匿名", out existsObj, out message);
            if (result != 1)
            {
                log.Debug(string.Format("{0}, order: {1}, url: {2}, ip: {3}", message, orderStr, url, ipHtml ?? "null"));
                continue;
            }
            else
            {
                log.Debug(string.Format("IP inserted, order: {0}, url: {1}, ip: {2}", orderStr, url, ipHtml ?? "null"));
            }
        }
        log.Debug(string.Format("Order {0} imported, url: {1}", orderStr, url));
    }
    log.Debug(string.Format("All orders imported, url: {0}", url));
}
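The goto-based retry inside Run() loops forever with a fixed 3-second sleep. A minimal, self-contained sketch of the same pattern with a bounded attempt count (the helper name, cap, and delay are illustrative, not part of the original code):

using System;
using System.Threading;

static class FetchRetry
{
    // Invoke fetch until it succeeds; sleep between attempts and give up
    // after maxAttempts failures instead of retrying forever via goto.
    public static string RetryFetch(Func<string> fetch, int maxAttempts = 10, int delayMs = 3000)
    {
        for (int attempt = 1; ; attempt++)
        {
            try
            {
                return fetch();
            }
            catch (Exception)
            {
                if (attempt >= maxAttempts) throw; // rethrow once the cap is hit
                Thread.Sleep(delayMs);
            }
        }
    }
}

Inside the loop above it would be called as string ipHtml = FetchRetry.RetryFetch(() => SpiderHelp.GetHtml(requestUrl, "utf-8"));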
public void RunPageList(object url)
{
    string _url = Convert.ToString(url);
    string pageUrlPara = _url + "&pageid={0}";
    try
    {
        SysData_ProxyIp existsObj = new SysData_ProxyIp();
        string message = "";
        int pageIndex = 1;
        while (true)
        {
            string pageUrl = string.Format(pageUrlPara, pageIndex);
            int reqCount = 1;
        reqBegin:
            Dictionary<string, List<string>> dicValueList = SpiderHelp.GetHtmlByRegexNotProxyIp(pageUrl, "gb2312", regexDic);
            List<string> ipInfoList = dicValueList["regex_ipInfo"]; // all IP entries on the page
            if (ipInfoList == null || ipInfoList.Count < 1)
            {
                if (reqCount < 3)
                {
                    // Empty result: retry the same page up to three times.
                    reqCount = reqCount + 1;
                    goto reqBegin;
                }
                else
                {
                    // Still empty after three attempts: start over from page 1.
                    pageIndex = 1;
                    log.Debug(string.Format("No IP list found, url: {0}", _url));
                    continue;
                }
            }
            foreach (string ipInfo in ipInfoList)
            {
                Dictionary<string, List<string>> infoListDic = SpiderHelp.GetStrByRegex(ipInfo, regexDic);
                string ip = infoListDic["regex_ip"].Count < 1 ? "" : infoListDic["regex_ip"][0];
                ip = ip.TrimBlank();
                int result = ProxyIpHelp.ImportProxyIp(ip, "匿名", out existsObj, out message);
                if (result != 1)
                {
                    log.Debug(string.Format("{0}, url: {1}, ip: {2}", message, _url, ip ?? "null"));
                    continue;
                }
                else
                {
                    log.Debug(string.Format("IP inserted, url: {0}, ip: {1}", _url, ip ?? "null"));
                }
            }
            pageIndex = pageIndex + 1;
        }
    }
    catch (Exception ex)
    {
        log.Error("System error", ex);
    }
}
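Note that RunPageList never terminates on its own: after three empty responses it resets pageIndex to 1 and starts over, which suits a daemon but loops forever otherwise. A sketch of a variant that exits once a page stays empty (fetchPage and all names are assumptions standing in for SpiderHelp.GetHtmlByRegexNotProxyIp):

using System;
using System.Collections.Generic;

static class PagedCrawl
{
    // Walk pages 1, 2, 3, ... retrying each page up to maxRetries times,
    // and stop when a page is still empty after the retries.
    public static void CrawlPages(Func<int, List<string>> fetchPage, int maxRetries = 3)
    {
        for (int pageIndex = 1; ; pageIndex++)
        {
            List<string> entries = null;
            for (int attempt = 0; attempt < maxRetries; attempt++)
            {
                entries = fetchPage(pageIndex);
                if (entries != null && entries.Count > 0) break;
            }
            if (entries == null || entries.Count == 0)
            {
                return; // ran off the end of the listing
            }
            // ... import each entry as RunPageList does ...
        }
    }
}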
public void Run()
{
    // Regex for one IP entry on the list page.
    RegexInfo regex_ipInfo = new RegexInfo("([^<>]+\\@[^<>]+</font>)", "$1");
    // Regex for the IP itself.
    RegexInfo regex_ip = new RegexInfo("([^<>]+)\\@[^<>]+</font>", "$1");
    // Regex for the IP's region.
    RegexInfo regex_area = new RegexInfo("[^<>]+\\#([^<>]+)</font>", "$1");
    Dictionary<string, RegexInfo> regexDic1 = new Dictionary<string, RegexInfo>();
    regexDic1.Add("regex_ipInfo", regex_ipInfo);
    Dictionary<string, RegexInfo> regexDic2 = new Dictionary<string, RegexInfo>();
    regexDic2.Add("regex_ip", regex_ip);
    regexDic2.Add("regex_area", regex_area);
    // Load the list of category pages to crawl from the config file.
    List<string> urlList = new List<string>();
    string configPath = AppDomain.CurrentDomain.BaseDirectory + "IP代理/Config/Lastro网IP代理ProxyIpConfig.txt";
    using (StreamReader sr = new StreamReader(configPath))
    {
        string str;
        while ((str = sr.ReadLine()) != null)
        {
            urlList.Add(str);
        }
    }
    // Then collect further list-page urls from the forum listing itself.
    RegexInfo regex_urllist = new RegexInfo("<a href=\"([^\"]+)\"[^<>]*>【国内代理】[^<>]+</a>", "$1");
    RegexInfo regex_urllist_pagecount = new RegexInfo("<span title=\"共[^\"]+页\"> /([^<>]+)页", "$1");
    Dictionary<string, RegexInfo> regexDic3 = new Dictionary<string, RegexInfo>();
    regexDic3.Add("regex_urllist", regex_urllist);
    regexDic3.Add("regex_urllist_pagecount", regex_urllist_pagecount);
    int urllist_index = 1;
    int urllist_max_index = 1;
    string urllist_nextPage_para = "http://www.httpip.net/forum-36-{0}.html"; // next-page url template
    string urllist_nextPage = string.Format(urllist_nextPage_para, urllist_index); // next page
    try
    {
    begin_nextpage:
        Dictionary<string, List<string>> dicValueList3 = SpiderHelp.GetHtmlByRegexNotProxyIp(urllist_nextPage, "gb2312", regexDic3);
        List<string> urlList2 = dicValueList3["regex_urllist"]; // all list-page urls
        urlList.AddRange(urlList2);
        // Maximum page number of the listing.
        urllist_max_index = dicValueList3["regex_urllist_pagecount"].Count < 1
            ? 0
            : Convert.ToInt32(dicValueList3["regex_urllist_pagecount"][0].TrimBlank());
        if (urllist_index < urllist_max_index)
        {
            urllist_index = urllist_index + 1;
            urllist_nextPage = string.Format(urllist_nextPage_para, urllist_index);
        }
        else
        {
            urllist_nextPage = "";
        }
        // Crawl the IP pages collected for the current listing page.
        SysData_ProxyIp existsObj = new SysData_ProxyIp();
        string message = "";
        for (int i = 0; i < urlList.Count; i++)
        {
            string urlInfo = urlList[i];
            string url = urlInfo.Split('$')[0];
            string urlHost = urlInfo.Split('$').Length < 2 ? "http://www.httpip.net/" : urlInfo.Split('$')[1];
        requestpage:
            if (!url.ToLower().Contains("http://"))
            {
                url = urlHost + url;
            }
            Dictionary<string, List<string>> dicValueList = SpiderHelp.GetHtmlByRegexNotProxyIp(url, "gb2312", regexDic1);
            List<string> ipInfoList = dicValueList["regex_ipInfo"]; // all IP entries
            string nextPage = ""; // next-page link (never set for this site, so the goto below is inert)
            if (ipInfoList == null || ipInfoList.Count < 1)
            {
                log.Debug(string.Format("No IP list found, url: {0}", url));
                continue;
            }
            foreach (string ipInfo in ipInfoList)
            {
                Dictionary<string, List<string>> infoListDic = SpiderHelp.GetStrByRegex(ipInfo, regexDic2);
                string ip = infoListDic["regex_ip"].Count < 1 ? "" : infoListDic["regex_ip"][0];
                string ipArea = infoListDic["regex_area"].Count < 1 ? "" : infoListDic["regex_area"][0];
                ip = ip.TrimBlank();
                int result = ProxyIpHelp.ImportProxyIp(ip, ipArea, out existsObj, out message);
                if (result != 1)
                {
                    log.Debug(string.Format("{0}, url: {1}, ip: {2}", message, url, ip ?? "null"));
                    continue;
                }
                else
                {
                    log.Debug(string.Format("IP inserted, url: {0}, ip: {1}", url, ip ?? "null"));
                }
            }
            if (!string.IsNullOrEmpty(nextPage))
            {
                // Resolve a relative next-page link against the host before re-requesting.
                url = nextPage.ToLower().Contains("http://") ? nextPage : urlHost + nextPage;
                goto requestpage;
            }
        }
        if (!string.IsNullOrEmpty(urllist_nextPage))
        {
            // Move on to the next listing page with a fresh url list.
            urlList = new List<string>();
            goto begin_nextpage;
        }
    }
    catch (Exception ex)
    {
        log.Error("System error", ex);
    }
}
public void Run()
{
    // Regex for one IP entry on the list page. "\&nbsp\;" matches the literal
    // "&nbsp;" entity in the page source (the entity had been decoded to a
    // bare space in the original listing and is restored here).
    RegexInfo regex_ipInfo = new RegexInfo("(\\&nbsp\\;[^<>]+<br />)", "$1");
    // Regex for ip:port.
    RegexInfo regex_ip = new RegexInfo("\\&nbsp\\;([^<>\\s]+) ([^<>\\s]+) [^<>]+<br />", "$1:$2");
    // Regex for the IP's region.
    RegexInfo regex_area = new RegexInfo("\\&nbsp\\;[^<>\\s]+ [^<>\\s]+ ([^<>\\s]+) [^<>]+<br />", "$1");
    Dictionary<string, RegexInfo> regexDic1 = new Dictionary<string, RegexInfo>();
    regexDic1.Add("regex_ipInfo", regex_ipInfo);
    Dictionary<string, RegexInfo> regexDic2 = new Dictionary<string, RegexInfo>();
    regexDic2.Add("regex_ip", regex_ip);
    regexDic2.Add("regex_area", regex_area);
    // Load the list of catalog pages to crawl from the config file.
    List<string> urlList = new List<string>();
    string configPath = AppDomain.CurrentDomain.BaseDirectory + "IP代理/Config/猫扑网IP代理ProxyIpConfig.txt";
    using (StreamReader sr = new StreamReader(configPath))
    {
        string str;
        while ((str = sr.ReadLine()) != null)
        {
            urlList.Add(str);
        }
    }
    // Then collect further list-page urls from the catalog pages themselves.
    // "\&raquo\;" matches the literal "&raquo;" entity in the page source.
    RegexInfo regex_urllist = new RegexInfo("<DT><a href=\"([^\"]+)\" target=\"_blank\">[^<>]+</a></DT>", "$1");
    RegexInfo regex_urllist_pagecount = new RegexInfo("<a href=\"http://www.itmop.com/proxy/catalog.asp\\?page=(\\d+)\">\\&raquo\\;</a></DIV>", "$1");
    Dictionary<string, RegexInfo> regexDic3 = new Dictionary<string, RegexInfo>();
    regexDic3.Add("regex_urllist", regex_urllist);
    regexDic3.Add("regex_urllist_pagecount", regex_urllist_pagecount);
    int urllist_index = 1;
    int urllist_max_index = 1;
    string urllist_nextPage_para = "http://www.itmop.com/proxy/catalog.asp?page={0}"; // next-page url template
    string urllist_nextPage = string.Format(urllist_nextPage_para, urllist_index); // next page
    try
    {
    begin_nextpage:
        // Fetch the list-page urls on the current catalog page.
        Dictionary<string, List<string>> dicValueList3 = SpiderHelp.GetHtmlByRegexNotProxyIp(urllist_nextPage, "utf-8", regexDic3);
        List<string> urlList2 = dicValueList3["regex_urllist"]; // all list-page urls
        urlList.AddRange(urlList2);
        // Maximum page number of the catalog.
        urllist_max_index = dicValueList3["regex_urllist_pagecount"].Count < 1
            ? 0
            : Convert.ToInt32(dicValueList3["regex_urllist_pagecount"][0].TrimBlank());
        if (urllist_index < urllist_max_index)
        {
            urllist_index = urllist_index + 1;
            urllist_nextPage = string.Format(urllist_nextPage_para, urllist_index);
        }
        else
        {
            urllist_nextPage = "";
        }
        // Crawl the IP pages collected for the current catalog page.
        SysData_ProxyIp existsObj = new SysData_ProxyIp();
        string message = "";
        for (int i = 0; i < urlList.Count; i++)
        {
            string urlInfo = urlList[i];
            // Decode HTML-escaped ampersands in the href.
            urlInfo = urlInfo.Replace("&amp;", "&");
            string url = urlInfo.Split('$')[0];
            string urlHost = urlInfo.Split('$').Length < 2 ? "http://www.itmop.com" : urlInfo.Split('$')[1];
        requestpage:
            // Fetch IP entries from the list page.
            if (!url.ToLower().Contains("http://"))
            {
                url = urlHost + url;
            }
            Dictionary<string, List<string>> dicValueList = SpiderHelp.GetHtmlByRegexNotProxyIp(url, "utf-8", regexDic1);
            List<string> ipInfoList = dicValueList["regex_ipInfo"]; // all IP entries
            string nextPage = ""; // next-page link (never set for this site, so the goto below is inert)
            if (ipInfoList == null || ipInfoList.Count < 1)
            {
                log.Debug(string.Format("No IP list found, url: {0}", url));
                continue;
            }
            foreach (string ipInfo in ipInfoList)
            {
                Dictionary<string, List<string>> infoListDic = SpiderHelp.GetStrByRegex(ipInfo, regexDic2);
                string ip = infoListDic["regex_ip"].Count < 1 ? "" : infoListDic["regex_ip"][0];
                string ipArea = infoListDic["regex_area"].Count < 1 ? "" : infoListDic["regex_area"][0];
                ipArea = ipArea.RemoveHeml();
                ip = ip.TrimBlank();
                int result = ProxyIpHelp.ImportProxyIp(ip, ipArea, out existsObj, out message);
                if (result != 1)
                {
                    log.Debug(string.Format("{0}, url: {1}, ip: {2}", message, url, ip ?? "null"));
                    continue;
                }
                else
                {
                    log.Debug(string.Format("IP inserted, url: {0}, ip: {1}", url, ip ?? "null"));
                }
            }
            if (!string.IsNullOrEmpty(nextPage))
            {
                url = nextPage.ToLower().Contains("http://") ? nextPage : urlHost + nextPage;
                goto requestpage;
            }
        }
        if (!string.IsNullOrEmpty(urllist_nextPage))
        {
            // Move on to the next catalog page with a fresh url list.
            urlList = new List<string>();
            goto begin_nextpage;
        }
    }
    catch (Exception ex)
    {
        log.Error("System error", ex);
    }
}
public void ImportIp(object param)
{
    string fileName = Convert.ToString(param);
    int count = 0; // number of successful imports
    // Disable the other controls while importing.
    panel查询条件.Enabled = false;
    btnExcel2.Enabled = false;
    btnImportIp.Visible = false;
    // Position and reset the progress bar.
    panelImport.Visible = true;
    panelImport.Top = btnImportIp.Top;
    panelImport.Left = btnImportIp.Left;
    labelImportBar.Text = "0%";
    progressBarImportIp.Value = 0;
    SysData_ProxyIp existsObj = new SysData_ProxyIp();
    string message = "";
    object missing = System.Reflection.Missing.Value;
    Excel.Application excel = new Excel.Application();
    if (excel == null)
    {
        MessageBox.Show("Can't access excel");
    }
    else
    {
        excel.Visible = false;
        excel.UserControl = true;
        // Open the Excel file read-only.
        Excel.Workbook wb = excel.Application.Workbooks.Open(fileName, missing, true, missing, missing,
            missing, missing, missing, missing, true, missing, missing, missing, missing, missing);
        // Take the first worksheet.
        Excel.Worksheet ws = (Excel.Worksheet)wb.Worksheets.get_Item(1);
        // Total number of used rows.
        int rowsint = ws.UsedRange.Cells.Rows.Count;
        // Configure the progress bar range.
        progressBarImportIp.Maximum = rowsint;
        progressBarImportIp.Minimum = 0;
        // Read the data range (first row, first column through last row, second column) in one call.
        Excel.Range rng1 = ws.Cells.get_Range("A1", "B" + rowsint);
        object[,] arryItem = (object[,])rng1.Value2;
        for (int i = 1; i <= rowsint - 1; i++)
        {
            string ip = arryItem[i, 1].ToString();
            string ipArea = arryItem[i, 2].ToString();
            int result = ProxyIpHelp.ImportProxyIp(ip, ipArea, out existsObj, out message);
            if (result == 1)
            {
                count = count + 1;
            }
            progressBarImportIp.Value = progressBarImportIp.Value + 1;
            labelImportBar.Text = 计算百分比(progressBarImportIp.Value, progressBarImportIp.Maximum);
        }
        progressBarImportIp.Value = progressBarImportIp.Maximum;
        labelImportBar.Text = 计算百分比(progressBarImportIp.Value, progressBarImportIp.Maximum);
    }
    excel.Quit();
    excel = null;
    // Brute-force cleanup: kill every Excel process. Note this also
    // terminates any unrelated Excel instances the user has open.
    Process[] procs = Process.GetProcessesByName("excel");
    foreach (Process pro in procs)
    {
        pro.Kill();
    }
    GC.Collect();
    MessageBox.Show(string.Format("Import finished: {0} usable proxy IPs imported", count));
    // Restore the UI.
    panelImport.Visible = false;
    btnExcel2.Enabled = true;
    btnImportIp.Visible = true;
    panel查询条件.Enabled = true;
    labelImportBar.Text = "";
}
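Killing every process named "excel" is a blunt instrument. Releasing the interop objects explicitly lets the EXCEL.EXE instance exit on its own; a sketch assuming the same Microsoft.Office.Interop.Excel reference used above (the helper itself is not from the original code):

using System;
using System.Runtime.InteropServices;
using Excel = Microsoft.Office.Interop.Excel;

static class ExcelCleanup
{
    // Close the workbook and release the COM objects in reverse order of
    // acquisition so the Excel process can shut down without being killed.
    public static void Close(Excel.Application excel, Excel.Workbook wb, Excel.Worksheet ws)
    {
        wb.Close(false);                  // discard changes
        excel.Quit();
        Marshal.ReleaseComObject(ws);
        Marshal.ReleaseComObject(wb);
        Marshal.ReleaseComObject(excel);
        GC.Collect();
        GC.WaitForPendingFinalizers();
    }
}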
public void Run()
{
    // One <tr> row of the IP table.
    RegexInfo regex_ipInfo = new RegexInfo("(<tr class=\"[^\"]*\">(?:(?!</tr>).)*</tr>)", "$1");
    // Link to the next page.
    RegexInfo regex_nextPage = new RegexInfo("<a class=\"next_page\" rel=\"next\" href=\"([^\"]+)\">下一页[^<>]*</a>", "$1");
    // ip:port within a row.
    RegexInfo regex_ip = new RegexInfo("<tr class=\"[^\"]*\"><td>(?:(?!</td>).)*</td><td>([^<>]+)</td><td>([^<>]+)</td><td><a href=\"[^\"]*\">[^<>]+</a></td>" +
        "<td>[^<>]*</td><td>[^<>]*</td><td>(?:(?!</td>).)*</td><td>(?:(?!</td>).)*</td><td>[^<>]+</td>", "$1:$2");
    // Region within a row.
    RegexInfo regex_area = new RegexInfo("<tr class=\"[^\"]*\"><td>(?:(?!</td>).)*</td><td>[^<>]+</td><td>[^<>]+</td><td><a href=\"[^\"]*\">([^<>]+)</a></td>" +
        "<td>[^<>]*</td><td>[^<>]*</td><td>(?:(?!</td>).)*</td><td>(?:(?!</td>).)*</td><td>[^<>]+</td>", "$1");
    // Checked-at date within a row.
    RegexInfo regex_date = new RegexInfo("<tr class=\"[^\"]*\"><td>(?:(?!</td>).)*</td><td>[^<>]+</td><td>[^<>]+</td><td><a href=\"[^\"]*\">[^<>]+</a></td>" +
        "<td>[^<>]*</td><td>[^<>]*</td><td>(?:(?!</td>).)*</td><td>(?:(?!</td>).)*</td><td>([^<>]+)</td>", "$1");
    Dictionary<string, RegexInfo> regexDic1 = new Dictionary<string, RegexInfo>();
    regexDic1.Add("regex_ipInfo", regex_ipInfo);
    regexDic1.Add("regex_nextPage", regex_nextPage);
    Dictionary<string, RegexInfo> regexDic2 = new Dictionary<string, RegexInfo>();
    regexDic2.Add("regex_ip", regex_ip);
    regexDic2.Add("regex_area", regex_area);
    regexDic2.Add("regex_date", regex_date);
    // Load crawl settings ("listUrl$host$cutoffDate") from the config file.
    List<string> urlList = new List<string>();
    string configPath = AppDomain.CurrentDomain.BaseDirectory + "IP代理/Config/西刺ProxyIpConfig.txt";
    using (StreamReader sr = new StreamReader(configPath))
    {
        string str;
        while ((str = sr.ReadLine()) != null)
        {
            urlList.Add(str);
        }
    }
    //urlList.Add("http://www.xici.net.co/nt/$http://www.xici.net.co/$2014-5-13");
    try
    {
        SysData_ProxyIp existsObj = new SysData_ProxyIp();
        string message = "";
        foreach (string urlInfo in urlList)
        {
            string url = urlInfo.Split('$')[0];
            string urlHost = urlInfo.Split('$')[1];
            DateTime maxDate = Convert.ToDateTime(urlInfo.Split('$')[2]);
        requestpage:
            Dictionary<string, List<string>> dicValueList = SpiderHelp.GetHtmlByRegexNotProxyIp(url, "utf-8", regexDic1);
            List<string> ipInfoList = dicValueList["regex_ipInfo"]; // all IP entries
            string nextPage = dicValueList["regex_nextPage"].Count < 1 ? "" : dicValueList["regex_nextPage"][0]; // next-page link
            if (ipInfoList == null || ipInfoList.Count < 1)
            {
                log.Debug(string.Format("No IP list found, url: {0}", url));
                continue;
            }
            foreach (string ipInfo in ipInfoList)
            {
                Dictionary<string, List<string>> infoListDic = SpiderHelp.GetStrByRegex(ipInfo, regexDic2);
                string ip = infoListDic["regex_ip"].Count < 1 ? "" : infoListDic["regex_ip"][0];
                string ipArea = infoListDic["regex_area"].Count < 1 ? "" : infoListDic["regex_area"][0];
                DateTime ipDate = infoListDic["regex_date"].Count < 1 ? DateTime.Now : Convert.ToDateTime(infoListDic["regex_date"][0]);
                ip = ip.TrimBlank();
                int result = ProxyIpHelp.ImportProxyIp(ip, ipArea, out existsObj, out message);
                if (result != 1)
                {
                    log.Debug(string.Format("{0}, url: {1}, ip: {2}", message, url, ip ?? "null"));
                    continue;
                }
                else
                {
                    log.Debug(string.Format("IP inserted, url: {0}, ip: {1}", url, ip ?? "null"));
                }
                // Stop paging once entries are older than the configured cutoff date.
                if (ipDate < maxDate)
                {
                    nextPage = null;
                    break;
                }
            }
            if (!string.IsNullOrEmpty(nextPage))
            {
                url = urlHost + nextPage;
                goto requestpage;
            }
        }
    }
    catch (Exception ex)
    {
        log.Error("System error", ex);
    }
}
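Convert.ToDateTime throws on a malformed scraped date cell, which would abort the whole crawl loop into the catch block. DateTime.TryParse preserves the fallback the method already uses for missing dates; a sketch (the helper is illustrative):

using System;

static class DateParse
{
    // Parse a scraped date defensively, falling back to DateTime.Now
    // (the same default Run() uses when the date cell is absent).
    public static DateTime ParseIpDate(string raw)
    {
        DateTime parsed;
        return DateTime.TryParse(raw, out parsed) ? parsed : DateTime.Now;
    }
}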