/// <summary>
/// Waits until <c>WorkItemManager.CheckPassSpider()</c> returns true, then calls
/// <c>ProxyIpManager.DeleteNotEffectiveProxyIp()</c>, logging before and after.
/// </summary>
public void Run()
{
    log.Debug("所有ip正在删除中...........");

    // Poll once a minute until the spider is allowed to run
    // (original note: checks whether a database maintenance program is executing).
    while (!WorkItemManager.CheckPassSpider())
    {
        System.Threading.Thread.Sleep(60000);
    }

    ProxyIpManager.DeleteNotEffectiveProxyIp();

    log.Debug("所有ip删除完成...........");
}
/// <summary>
/// Imports a proxy IP: rejects empty, already-stored, or unusable addresses and
/// otherwise inserts the address through <c>ProxyIpManager.InsertProxyIp</c>.
/// </summary>
/// <param name="ip">Proxy IP to import; must not be null or empty.</param>
/// <param name="ipArea">Area value forwarded unchanged to the insert call.</param>
/// <param name="existsObj">
/// Null when <paramref name="ip"/> is empty; the stored record when the IP already exists;
/// otherwise whatever <c>InsertProxyIp</c> assigns.
/// </param>
/// <param name="message">Result description (the runtime text is in Chinese).</param>
/// <returns>1 when the insert call reports 1; 0 in every other case.</returns>
public static int ImportProxyIp(string ip, string ipArea, out SysData_ProxyIp existsObj, out string message)
{
    existsObj = null;
    message = "";

    if (string.IsNullOrEmpty(ip))
    {
        message = "ip不能为空";
        return 0;
    }

    // Poll once a minute until the spider is allowed to run
    // (original note: checks whether a database maintenance program is executing).
    while (!WorkItemManager.CheckPassSpider())
    {
        System.Threading.Thread.Sleep(60000);
    }

    existsObj = ProxyIpManager.GetProxyIpByIp(ip);
    if (existsObj != null)
    {
        message = "ip已存在";
        return 0;
    }

    bool usable = CheckProxyIp(ip);
    if (!usable)
    {
        message = "ip不可用";
        return 0;
    }

    // Existence was already checked above, so the insert is told to skip its own check.
    bool inserted = ProxyIpManager.InsertProxyIp(ip, ipArea, out existsObj, out message, checkExists: false) == 1;
    if (inserted)
    {
        message = "ip插入成功";
    }

    // On failure the message produced by InsertProxyIp is left untouched.
    return inserted ? 1 : 0;
}
/// <summary>
/// Console entry point. Run tokens come from <paramref name="args"/> or, when none are
/// supplied, from one space-separated line read from standard input.
/// <para>
/// If the first token contains "1", <c>WorkItemManager.SetAllStop(1)</c> is called and the
/// program exits; if it contains "0", <c>WorkItemManager.SetAllStop(0)</c> is called and the
/// program exits. Otherwise the program waits for <c>WorkItemManager.CheckPassSpider()</c>
/// and then runs every token.
/// </para>
/// <para>
/// Tokens whose first entry does not contain "IP代理" have the form "A.B": the type
/// <c>FxtSpider.RunSource.A.B</c> is instantiated, falling back to
/// <c>FxtSpider.RunSource.A.其他城市</c> with <c>CityName = B</c>, and <c>start()</c> is called.
/// Proxy tokens have the form "A.B.Method": <c>FxtSpider.RunSource.A.B</c> is instantiated and
/// the named method is handed to <c>ExecIpSpider</c> on a new thread.
/// </para>
/// </summary>
/// <param name="args">Command-line run tokens; may be null or empty.</param>
/// <exception cref="ArgumentException">A token does not have enough '.'-separated parts.</exception>
/// <exception cref="InvalidOperationException">
/// A run type cannot be found in FxtSpider.RunSource.dll or does not implement INewDataRum.
/// </exception>
static void Main(string[] args)
{
    string[] runType = args;
    if (args == null || args.Length < 1)
    {
        // Console.ReadLine returns null when standard input is closed or at end of stream;
        // empty tokens from repeated spaces are dropped so they cannot crash the parsing below.
        string paraStr = Console.ReadLine();
        runType = paraStr == null
            ? new string[0]
            : paraStr.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
    }

    if (runType.Length < 1)
    {
        // Nothing to run: report it instead of failing with an index/null error.
        Console.Write("未提供运行参数");
        return;
    }

    // NOTE(review): these are substring matches, so any first token containing the
    // character '1' or '0' is treated as a pause/resume command — confirm this is intended.
    if (runType[0].Contains("1"))
    {
        WorkItemManager.SetAllStop(1);
        Console.Write("爬取服务器暂停成功");
        return;
    }

    if (runType[0].Contains("0"))
    {
        WorkItemManager.SetAllStop(0);
        Console.Write("爬取服务器启动成功");
        return;
    }

    // Poll once a minute until the spider is allowed to run
    // (original note: checks whether a database maintenance program is executing).
    while (!WorkItemManager.CheckPassSpider())
    {
        System.Threading.Thread.Sleep(60000);
    }

    if (!runType[0].Contains("IP代理"))
    {
        #region (set log name)
        // Group the second part of every token under its first part; the groups form the log name.
        Dictionary<string, List<string>> groups = new Dictionary<string, List<string>>();
        foreach (string str in runType)
        {
            string[] parts = str.Split('.');
            if (parts.Length < 2)
            {
                throw new ArgumentException("Invalid run argument \"" + str + "\": expected at least two '.'-separated parts.");
            }

            List<string> members;
            if (!groups.TryGetValue(parts[0], out members))
            {
                members = new List<string>();
                groups.Add(parts[0], members);
            }
            members.Add(parts[1]);
        }

        // Each group is rendered as "key_v1.v2.v3;".
        StringBuilder sb = new StringBuilder();
        foreach (KeyValuePair<string, List<string>> kvp in groups)
        {
            sb.Append(kvp.Key).Append("_").Append(string.Join(".", kvp.Value.ToArray())).Append(";");
        }
        InitLogger("(" + sb.ToString() + ")");
        #endregion

        // The assembly path does not depend on the token, so resolve and load it once.
        Assembly runSource = Assembly.LoadFile(GetConfigDire() + "\\FxtSpider.RunSource.dll");
        foreach (string str in runType)
        {
            string[] parts = str.Split('.');

            Type type = runSource.GetType("FxtSpider.RunSource." + str);
            bool useFallback = type == null;
            if (useFallback)
            {
                // No dedicated type for this token: use the "其他城市" type of the same group.
                string fallbackName = "FxtSpider.RunSource." + parts[0] + ".其他城市";
                type = runSource.GetType(fallbackName);
                if (type == null)
                {
                    throw new InvalidOperationException("Type not found in FxtSpider.RunSource.dll: " + fallbackName);
                }
            }

            INewDataRum rum = Activator.CreateInstance(type) as INewDataRum;
            if (rum == null)
            {
                throw new InvalidOperationException("Type does not implement INewDataRum: " + type.FullName);
            }

            if (useFallback)
            {
                rum.CityName = parts[1];
            }

            rum.start();
        }
    }
    else
    {
        #region (set log name)
        // The log name is simply every token followed by ';'.
        StringBuilder sb = new StringBuilder();
        foreach (string str in runType)
        {
            sb.Append(str).Append(";");
        }
        InitLogger("(" + sb.ToString() + ")");
        #endregion

        // The assembly path does not depend on the token, so resolve and load it once.
        Assembly runSource = Assembly.LoadFile(GetConfigDire() + "\\FxtSpider.RunSource.dll");
        foreach (string str in runType)
        {
            string[] parts = str.Split('.');
            if (parts.Length < 2)
            {
                throw new ArgumentException("Invalid run argument \"" + str + "\": expected at least two '.'-separated parts.");
            }

            string className = parts[0] + "." + parts[1];
            object objClass = runSource.CreateInstance("FxtSpider.RunSource." + className);
            if (objClass == null)
            {
                // Unknown class: the token is skipped silently, as before.
                continue;
            }

            if (parts.Length < 3)
            {
                throw new ArgumentException("Invalid run argument \"" + str + "\": the third '.'-separated part must name the method to run.");
            }

            // NOTE(review): GetMethod returns null when the method does not exist; the null is
            // still passed on, so ExecIpSpider is presumably expected to cope — confirm.
            MethodInfo method = objClass.GetType().GetMethod(parts[2]);
            ArrayList al = new ArrayList { method, objClass };

            Thread m_thread = new Thread(new ParameterizedThreadStart(ExecIpSpider));
            m_thread.Start(al);
        }
    }
}
/// <summary>
/// Downloads the page at <paramref name="url"/> via <c>GetHtml</c>, removes line breaks and tabs
/// from the HTML, and applies every rule in <paramref name="dic"/> to it.
/// <para>
/// A failed download is retried twice on the same connection settings; if
/// <c>webObj.BlockadeOfIP</c> is set, up to three further attempts are made with a fresh proxy IP
/// each time. When <c>webObj.BlockadeOfIPType</c> equals <c>WebsiteManager.BlockadeOfIPType1</c>
/// and the page matches <c>regex_checkcode</c> (a captcha page), the proxy is replaced and the
/// download repeated, up to three times. Unrecoverable failures are recorded through
/// <c>DatSpiderErrorLogManager.InsertError</c>.
/// </para>
/// </summary>
/// <param name="url">Page URL.</param>
/// <param name="encoding">Page encoding, forwarded to <c>GetHtml</c>.</param>
/// <param name="dic">
/// Regex rules keyed by name. A key containing "*" marks the rule as important: if it yields no
/// match on a successfully downloaded, non-captcha page, an error (Code_1_2) is recorded.
/// When null, an empty dictionary is returned without any download.
/// </param>
/// <param name="webObj">
/// Website record; its ID and blockade settings are read.
/// NOTE(review): it is dereferenced unconditionally, so it must be non-null whenever
/// <paramref name="dic"/> is non-null.
/// </param>
/// <param name="cityId">City id written to error-log entries.</param>
/// <param name="referer">Referer forwarded to <c>GetHtml</c>.</param>
/// <param name="keepAlive">Keep-alive flag forwarded to <c>GetHtml</c>.</param>
/// <param name="timeout">Timeout forwarded to <c>GetHtml</c>.</param>
/// <returns>
/// One entry per key of <paramref name="dic"/> holding the result of <c>GetStrByRegexByIndex</c>,
/// plus a "NotData" entry containing "1" when the page is a 搜房网 "listing sold" page.
/// </returns>
public static Dictionary<string, List<string>> GetHtmlByRegex(string url, string encoding, Dictionary<string, RegexInfo> dic, 网站表 webObj, int cityId, string referer = null, bool keepAlive = false, int timeout = 60000)
{
    Dictionary<string, List<string>> resultDic = new Dictionary<string, List<string>>();
    if (dic == null)
    {
        return resultDic;
    }

    string nowProxyIp = null;          // proxy currently in use; null means a direct request
    int networkRetryCount = 0;         // plain retries after a failed download (never reset)
    int networkProxySwapCount = 0;     // proxy replacements caused by failed downloads
    int captchaProxySwapCount = 0;     // proxy replacements caused by captcha pages
    string resultHtml = "";
    string ipStr = "";                 // comma-terminated list of every proxy tried, for error logs
    bool checkError = false;           // true when the final page still demands a captcha

    while (true)
    {
        try
        {
            resultHtml = GetHtml(url, encoding, proxyIp: nowProxyIp, referer: referer, keepAlive: keepAlive, timeout: timeout);
        }
        catch (Exception ex)
        {
            if (networkRetryCount < 2)
            {
                System.Threading.Thread.Sleep(3000);
                networkRetryCount++;
                continue;
            }

            if (webObj.BlockadeOfIP && networkProxySwapCount < 3)
            {
                System.Threading.Thread.Sleep(2000);
                networkProxySwapCount++;

                // Wait until the spider is allowed to touch the database
                // (original note: checks whether a database maintenance program is executing).
                while (!WorkItemManager.CheckPassSpider())
                {
                    System.Threading.Thread.Sleep(60000);
                }

                ProxyIpManager.SetNotEffectiveProxyIp(webObj.ID, nowProxyIp);   // mark the current proxy as unusable
                nowProxyIp = ProxyIpManager.GetEffectiveProxyIp(webObj.ID);     // fetch a replacement proxy
                ipStr = ipStr + nowProxyIp + ",";
                continue;
            }

            log.Error(string.Format("SpiderHouse:(requestUrl:{0}--请求异常)", url), ex);

            // Record the failure (network error) once the database is available.
            while (!WorkItemManager.CheckPassSpider())
            {
                System.Threading.Thread.Sleep(60000);
            }
            DatSpiderErrorLogManager.InsertError(cityId, webObj.ID, url, SysCodeManager.Code_1_1, "所用ip:" + ipStr);

            // Discard any page left over from an earlier attempt (e.g. a captcha page fetched
            // through a previous proxy): this attempt produced no content, and the checks below
            // treat an empty page as "download failed".
            resultHtml = "";
        }

        resultHtml = Regex.Replace(resultHtml, @"(([\r\n])[\s]+|[\r\n]+|[\t]+)", "", RegexOptions.IgnoreCase);

        // Captcha detection, only for sites configured with BlockadeOfIPType1.
        checkError = false;
        if (webObj.BlockadeOfIPType == WebsiteManager.BlockadeOfIPType1)
        {
            List<string> checkList = GetStrByRegexByIndex(resultHtml, regex_checkcode);
            if (checkList != null && checkList.Count > 0)
            {
                // A page came back, so the network-failure proxy budget starts over.
                networkProxySwapCount = 0;

                if (captchaProxySwapCount < 3)
                {
                    captchaProxySwapCount++;

                    while (!WorkItemManager.CheckPassSpider())
                    {
                        System.Threading.Thread.Sleep(60000);
                    }

                    ProxyIpManager.SetNotEffectiveProxyIp(webObj.ID, nowProxyIp);   // mark the current proxy as unusable
                    nowProxyIp = ProxyIpManager.GetEffectiveProxyIp(webObj.ID);     // fetch a replacement proxy
                    ipStr = ipStr + nowProxyIp + ",";
                    continue;
                }

                // Out of captcha retries: remember it and record the failure.
                checkError = true;
                while (!WorkItemManager.CheckPassSpider())
                {
                    System.Threading.Thread.Sleep(60000);
                }
                DatSpiderErrorLogManager.InsertError(cityId, webObj.ID, url, SysCodeManager.Code_1_3,
                                                     string.Format("所用ip:{0}", ipStr));
                log.Debug(string.Format("SpiderHouse:(requestUrl:{0}--请求异常:需输入验证码)", url));
            }
        }

        break;
    }

    // 搜房网 "listing sold" page: rules are still evaluated, but missing matches are not errors.
    bool soldOut = webObj != null &&
                   webObj.ID == WebsiteManager.搜房网_ID &&
                   resultHtml.Contains(">此房源已售出!<") &&
                   resultHtml.Contains(">您可以选择查看:<");

    foreach (KeyValuePair<string, RegexInfo> kvp in dic)
    {
        string key = kvp.Key;
        RegexInfo rule = kvp.Value;

        List<string> matches = GetStrByRegexByIndex(resultHtml, rule);
        resultDic.Add(key, matches);

        if (soldOut)
        {
            continue;
        }

        // Important rule && page downloaded && no captcha failure && rule actually defined.
        bool mustMatch = key.Contains("*") &&
                         !string.IsNullOrEmpty(resultHtml) &&
                         !checkError &&
                         rule != null &&
                         !string.IsNullOrEmpty(rule.RegexStr);

        if (mustMatch && (matches == null || matches.Count < 1))
        {
            // Record the failure (rule produced no text).
            // NOTE(review): unlike the other error-log writes in this method, this one is not
            // preceded by a CheckPassSpider wait — confirm whether that is intentional.
            DatSpiderErrorLogManager.InsertError(cityId, webObj.ID, url, SysCodeManager.Code_1_2,
                                                 string.Format("描述:{0},规则:{1},索引:{2},其他规则个数:{3},所用ip:{4}",
                                                               key,
                                                               rule.RegexStr,
                                                               rule.RegexIndex,
                                                               rule.RegexInfoList == null ? 0 : rule.RegexInfoList.Count,
                                                               ipStr));
        }
    }

    if (soldOut)
    {
        resultDic.Add("NotData", new List<string> { "1" });
    }

    return resultDic;
}