/// <summary>
/// Configures the cb网站 combo box so that it displays 网站名称 and uses ID as its value.
/// </summary>
private void 绑定所有网站()
{
    // A list containing only the "全部" placeholder (ID = -1) is built here, but this
    // method does not assign it to cb网站.DataSource.
    List<网站表> sites = new List<网站表>();
    sites.Insert(0, new 网站表 { ID = -1, 网站名称 = "全部" });

    cb网站.DisplayMember = "网站名称";
    cb网站.ValueMember = "ID";
}
/// <summary>
/// Looks up the ID of the cached website whose 网站名称 equals <paramref name="webName"/>.
/// </summary>
/// <param name="webName">Website name to search for in 所有网站.</param>
/// <returns>
/// The ID of the first matching website, or 0 when <paramref name="webName"/> is null or
/// empty or no cached website has that name.
/// </returns>
public static int GetWebIdByWebName(string webName)
{
    // A null/empty name can never match, so skip scanning the list altogether.
    if (string.IsNullOrEmpty(webName))
    {
        return 0;
    }

    // The lookup only reads the in-memory 所有网站 list; no data context is needed.
    网站表 website = 所有网站.Find(site => webName.Equals(site.网站名称));
    return website != null ? website.ID : 0;
}
/// <summary>
/// Handles a change of the selected city: reloads cb网站 with the websites returned for
/// that city, preceded by a "全部" entry whose ID is -1.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void cboxCity_SelectedIndexChanged(object sender, EventArgs e)
{
    string cityIdText = Convert.ToString(cboxCity.SelectedValue);

    // Nothing to do unless the selected value is a non-empty integer string.
    if (string.IsNullOrEmpty(cityIdText) || !StringHelp.IsInteger(cityIdText))
    {
        return;
    }

    int cityId = Convert.ToInt32(cityIdText);
    List<网站表> sites = WebsiteManager.GetWebByCityId(cityId);
    sites.Insert(0, new 网站表 { ID = -1, 网站名称 = "全部" });
    cb网站.DataSource = sites;
}
/// <summary>
/// Reads the city index at http://www.cityhouse.cn/city.html and, for every listed city
/// that resolves to a 城市表 row and has no crawl configuration for 城市房产 yet, builds an
/// "exec 往网站爬取配置表添加配置信息" statement. Statements are collected in two buffers
/// depending on whether a total page count could be extracted from the city's listing
/// page. Finally calls 导出任务计划配置文件().
/// </summary>
public void start()
{
    网站表 site = WebsiteManager.GetWebById(WebsiteManager.城市房产_ID);

    // Rule used on each city's listing page to capture the total page count.
    RegexInfo pageCountRule = new RegexInfo("<div class=\"[^\"]*\"><span class='fl mr'>\\d+/(\\d+)</span>", "$1");
    Dictionary<string, RegexInfo> pageCountRules = new Dictionary<string, RegexInfo>
    {
        { "总页数", pageCountRule }
    };

    // Rule used on the index page to capture each city's whole <a> element.
    RegexInfo cityAnchorRule = new RegexInfo("(<a[^<>]+href=[\",']{1,1}http\\://[^\\.]+.cityhouse.cn[\",']{1,1}[^<>]*>[^<>]+</a>)", "$1");
    Dictionary<string, RegexInfo> indexRules = new Dictionary<string, RegexInfo>
    {
        { "城市列表", cityAnchorRule }
    };

    Dictionary<string, List<string>> indexResult = SpiderHelp.GetHtmlByRegex(
        "http://www.cityhouse.cn/city.html",
        "utf-8",
        indexRules,
        WebsiteManager.GetWebById(WebsiteManager.城市房产_ID),
        CityId);
    List<string> cityAnchors = indexResult["城市列表"];

    StringBuilder withPageCount = new StringBuilder();
    StringBuilder withoutPageCount = new StringBuilder();

    foreach (string anchorHtml in cityAnchors)
    {
        // Split the anchor into the city's display name and its base URL.
        RegexInfo nameRule = new RegexInfo("<a[^<>]+href=[\",']{1,1}http\\://[^\\.]+.cityhouse.cn[\",']{1,1}[^<>]*>([^<>]+)</a>", "$1");
        RegexInfo urlRule = new RegexInfo("<a[^<>]+href=[\",']{1,1}(http\\://[^\\.]+.cityhouse.cn)[\",']{1,1}[^<>]*>[^<>]+</a>", "$1");
        Dictionary<string, RegexInfo> anchorRules = new Dictionary<string, RegexInfo>
        {
            { "regexCityName", nameRule },
            { "regexCityUrl", urlRule }
        };
        Dictionary<string, List<string>> anchorParts = SpiderHelp.GetStrByRegex(anchorHtml, anchorRules);

        List<string> nameMatches = anchorParts["regexCityName"];
        string rawName = "";
        if (nameMatches.Count > 0)
        {
            rawName = nameMatches[0];
        }
        string cityName = StringHelp.TrimBlank(rawName);

        List<string> urlMatches = anchorParts["regexCityUrl"];
        string cityUrl = "";
        if (urlMatches.Count > 0)
        {
            cityUrl = urlMatches[0];
        }

        string sqlTemplate = " exec 往网站爬取配置表添加配置信息 '{0}','{1}','{2}','{3}','{4}',{5},{6}";
        城市表 city = CityManager.Get城市_byLike城市名称(cityName);

        // Skip unknown cities, cities already present in either buffer, and cities
        // that already have a crawl configuration for this website.
        if (city == null
            || withoutPageCount.ToString().Contains(city.城市名称)
            || withPageCount.ToString().Contains(city.城市名称)
            || SpiderWebConfigManager.get网站爬取配置_by城市ID_网站ID(city.ID, WebsiteManager.城市房产_ID) != null)
        {
            continue;
        }

        string listUrl = cityUrl + "/forsale/flist.html?ob=10";
        Dictionary<string, List<string>> pageCountResult = SpiderHelp.GetHtmlByRegex(
            listUrl,
            "utf-8",
            pageCountRules,
            site,
            CityId,
            referer: listUrl);

        string sql = string.Format(sqlTemplate, WebsiteManager.城市房产, city.城市名称, cityUrl, listUrl, "", "2000", "2000");

        // Cities whose listing page yielded no page count go to the second buffer.
        StringBuilder target = pageCountResult["总页数"].Count() < 1 ? withoutPageCount : withPageCount;
        target.Append(sql).Append("\r\n");
    }

    // Materialized but not used further in this method.
    string result = withPageCount.ToString();
    string result2 = withoutPageCount.ToString();

    导出任务计划配置文件();
}
/// <summary>
/// Loads all websites into 所有网站 and caches the ID of each well-known website by
/// looking its name up in that list. An ID stays 0 when no row carries the name.
/// </summary>
static WebsiteManager()
{
    所有网站 = GetAllWebsite();

    搜房网_ID = ResolveWebsiteId(搜房网, 0);
    安居客_ID = ResolveWebsiteId(安居客, 0);
    新浪二手房_ID = ResolveWebsiteId(新浪二手房, 0);

    // These two IDs were never reset to 0 before their lookup, so the field's current
    // value is kept when no matching row exists.
    赶集网_ID = ResolveWebsiteId(赶集网, 赶集网_ID);
    五八同城_ID = ResolveWebsiteId(五八同城, 五八同城_ID);

    黄石信息港_ID = ResolveWebsiteId(黄石信息港, 0);
    住在九江_ID = ResolveWebsiteId(住在九江, 0);
    城市房产_ID = ResolveWebsiteId(城市房产, 0);
    河源置业网_ID = ResolveWebsiteId(河源置业网, 0);
    邯郸恋家网_ID = ResolveWebsiteId(邯郸恋家网, 0);
    常州房产网_ID = ResolveWebsiteId(常州房产网, 0);
    楼盘网_ID = ResolveWebsiteId(楼盘网, 0);
    搜狐二手房_ID = ResolveWebsiteId(搜狐二手房, 0);
    满堂红地产网_ID = ResolveWebsiteId(满堂红地产网, 0);
    置家网_ID = ResolveWebsiteId(置家网, 0);
    中国房产超市_ID = ResolveWebsiteId(中国房产超市, 0);
    中原地产_ID = ResolveWebsiteId(中原地产, 0);
}

/// <summary>
/// Returns the ID of the first entry in 所有网站 whose 网站名称 equals
/// <paramref name="websiteName"/>, or <paramref name="fallbackId"/> when there is none.
/// </summary>
/// <param name="websiteName">Website name to look for.</param>
/// <param name="fallbackId">Value returned when the list is null or nothing matches.</param>
/// <returns>The matching website's ID, or <paramref name="fallbackId"/>.</returns>
private static int ResolveWebsiteId(string websiteName, int fallbackId)
{
    // An exception escaping a static constructor makes the whole type unusable, so a
    // missing list and rows without a name are treated as "no match" instead of throwing.
    if (所有网站 == null)
    {
        return fallbackId;
    }

    网站表 match = 所有网站.Find(site =>
        site != null && site.网站名称 != null && site.网站名称.Equals(websiteName));

    return match != null ? match.ID : fallbackId;
}
/// <summary>
/// Returns the first website in 所有网站 whose ID equals <paramref name="id"/>.
/// </summary>
/// <param name="id">Website ID to search for.</param>
/// <returns>The matching 网站表 entry, or null when none has that ID.</returns>
public static 网站表 GetWebById(int id)
{
    return 所有网站.Find(candidate => candidate.ID == id);
}
/// <summary>
/// Downloads the page at <paramref name="url"/> via GetHtml and applies every regex rule
/// in <paramref name="dic"/> to the downloaded HTML, returning the extracted strings per
/// rule key.
/// </summary>
/// <remarks>
/// Flow, as implemented with goto labels:
/// a failed download is retried up to 2 times (3 s pause each); after that, when
/// webObj.BlockadeOfIP is set, the proxy IP is swapped up to 3 times (2 s pause each)
/// before the failure is logged and recorded with SysCodeManager.Code_1_1.
/// When webObj.BlockadeOfIPType equals WebsiteManager.BlockadeOfIPType1 and
/// regex_checkcode matches the HTML, the proxy IP is swapped up to 3 times; after that
/// the failure is recorded with SysCodeManager.Code_1_3 and rule misses are no longer
/// reported for this call.
/// NOTE(review): webObj is dereferenced without a null check in the catch block and in
/// the captcha check, although later code tests "webObj != null" — confirm whether a
/// null webObj is a supported input.
/// NOTE(review): resultHtml is only assigned inside the try, so when a later attempt
/// throws after an earlier attempt succeeded, the HTML of the earlier attempt is what
/// gets processed after the "end" label — confirm this is intended.
/// </remarks>
/// <param name="url">Page URL.</param>
/// <param name="encoding">Page encoding.</param>
/// <param name="dic">Regex rules keyed by a description; a null value yields an empty result.</param>
/// <param name="webObj">Website the page belongs to; its ID and blockade flags are read here.</param>
/// <param name="cityId">City ID passed to DatSpiderErrorLogManager.InsertError when a failure is recorded.</param>
/// <param name="referer">Passed through to GetHtml.</param>
/// <param name="keepAlive">Passed through to GetHtml.</param>
/// <param name="timeout">Passed through to GetHtml (presumably milliseconds — TODO confirm).</param>
/// <returns>
/// A dictionary holding, for each key of <paramref name="dic"/>, the list returned by
/// GetStrByRegexByIndex; plus a "NotData" entry when the page belongs to 搜房网 and
/// contains both sold-out marker strings.
/// </returns>
public static Dictionary <string, List <string> > GetHtmlByRegex(string url, string encoding, Dictionary <string, RegexInfo> dic, 网站表 webObj, int cityId, string referer = null, bool keepAlive = false, int timeout = 60000)
{
    // No proxy is used for the first attempt.
    string NowProxyIp = null;
    // Counters: plain retries after a download exception, proxy swaps after download
    // exceptions, and proxy swaps after captcha pages.
    int 网络异常重试次数 = 0;
    int 网络异常时代理ip更换次数 = 0;
    int 验证码异常时代理ip更换次数 = 0;
    string resultHtml = "";
    // Comma-separated trail of the proxy IPs used, included in recorded errors.
    // NowProxyIp is always null at this point, so the trail starts empty.
    string ipStr = NowProxyIp == null ? "" : NowProxyIp + ",";
    // Every retry / proxy swap jumps back here and starts with a fresh result dictionary.
begin:
    Dictionary <string, List <string> > resultDic = new Dictionary <string, List <string> >();
    if (dic == null)
    {
        return(resultDic);
    }
    try
    {
        resultHtml = GetHtml(url, encoding, proxyIp: NowProxyIp, referer: referer, keepAlive: keepAlive, timeout: timeout);
    }
    catch (Exception ex)
    {
        // Stage 1: plain retry. The counter is never reset, so once 2 retries are
        // spent, later failures go straight to stage 2.
        if (网络异常重试次数 < 2)
        {
            System.Threading.Thread.Sleep(3000);
            网络异常重试次数++;
            goto begin;
        }
        // Stage 2: swap the proxy IP, only when the website's BlockadeOfIP flag is set.
        if (webObj.BlockadeOfIP)
        {
            if (网络异常时代理ip更换次数 < 3)
            {
                System.Threading.Thread.Sleep(2000);
                网络异常时代理ip更换次数++;
work:
                if (!WorkItemManager.CheckPassSpider()) // Check whether a maintenance program is running on the database; poll every 60 s until the check passes.
                {
                    System.Threading.Thread.Sleep(60000);
                    goto work;
                }
                ProxyIpManager.SetNotEffectiveProxyIp(webObj.ID, NowProxyIp); // Mark the current proxy IP as unusable.
                NowProxyIp = ProxyIpManager.GetEffectiveProxyIp(webObj.ID); // Get a new proxy IP.
                ipStr = ipStr + NowProxyIp + ",";
                goto begin;
            }
        }
        // Stage 3: give up on downloading, log the exception and record the failure.
        log.Error(string.Format("SpiderHouse:(requestUrl:{0}--请求异常)", url), ex);
        // Record the reason and details of the crawl failure (network error).
work2:
        if (!WorkItemManager.CheckPassSpider()) // Check whether a maintenance program is running on the database.
        {
            System.Threading.Thread.Sleep(60000);
            goto work2;
        }
        DatSpiderErrorLogManager.InsertError(cityId, webObj.ID, url, SysCodeManager.Code_1_1, "所用ip:" + ipStr);
        goto end;
    }
end:
    // Strip line breaks (with any whitespace that follows them) and tabs before matching.
    resultHtml = Regex.Replace(resultHtml, @"(([\r\n])[\s]+|[\r\n]+|[\t]+)", "", RegexOptions.IgnoreCase);
    // Check whether the page asks for a captcha.
    bool checkError = false; // Records whether the failure is a captcha error.
    if (webObj.BlockadeOfIPType == WebsiteManager.BlockadeOfIPType1)
    {
        List <string> checkList = GetStrByRegexByIndex(resultHtml, regex_checkcode);
        if (checkList != null && checkList.Count > 0)
        {
            // A captcha page resets the budget of proxy swaps for download exceptions.
            网络异常时代理ip更换次数 = 0;
            if (验证码异常时代理ip更换次数 < 3)
            {
                验证码异常时代理ip更换次数++;
                // Swap the proxy IP and download the page again.
work3:
                if (!WorkItemManager.CheckPassSpider()) // Check whether a maintenance program is running on the database.
                {
                    System.Threading.Thread.Sleep(60000);
                    goto work3;
                }
                ProxyIpManager.SetNotEffectiveProxyIp(webObj.ID, NowProxyIp); // Mark the current proxy IP as unusable.
                NowProxyIp = ProxyIpManager.GetEffectiveProxyIp(webObj.ID); // Get a new proxy IP.
                ipStr = ipStr + NowProxyIp + ",";
                goto begin;
            }
            else
            {
                checkError = true;
                // Record that the crawl requires a captcha.
                // Record the reason and details of the crawl failure.
work4:
                if (!WorkItemManager.CheckPassSpider()) // Check whether a maintenance program is running on the database.
                {
                    System.Threading.Thread.Sleep(60000);
                    goto work4;
                }
                DatSpiderErrorLogManager.InsertError(cityId, webObj.ID, url, SysCodeManager.Code_1_3, string.Format("所用ip:{0}", ipStr) );
            }
            log.Debug(string.Format("SpiderHouse:(requestUrl:{0}--请求异常:需输入验证码)", url));
        }
    }
    foreach (KeyValuePair <string, RegexInfo> kvp in dic)
    {
        string key = kvp.Key;
        List <string> list = GetStrByRegexByIndex(resultHtml, kvp.Value);
        resultDic.Add(key, list);
        // For 搜房网 pages that contain both sold-out marker strings, rule misses are not reported.
        if (webObj != null && webObj.ID == WebsiteManager.搜房网_ID && resultHtml.Contains(">此房源已售出!<") && resultHtml.Contains(">您可以选择查看:<"))
        {
            continue;
        }
        // The rule is for an important field (its key contains "*") && HTML is not empty
        // && no captcha error && the rule and its pattern are present.
        if (key.Contains("*") && !string.IsNullOrEmpty(resultHtml) && !checkError && kvp.Value != null && !string.IsNullOrEmpty(kvp.Value.RegexStr))
        {
            // The rule extracted nothing (and this is not a captcha error).
            if (list == null || list.Count < 1)
            {
                // Record the reason and details of the crawl failure (the rule matched no text).
                DatSpiderErrorLogManager.InsertError(cityId, webObj.ID, url, SysCodeManager.Code_1_2, string.Format("描述:{0},规则:{1},索引:{2},其他规则个数:{3},所用ip:{4}", key, kvp.Value.RegexStr, kvp.Value.RegexIndex, kvp.Value.RegexInfoList == null ? 0 : kvp.Value.RegexInfoList.Count, ipStr ) );
            }
        }
    }
    // Flag sold-out 搜房网 pages to the caller with an extra "NotData" entry.
    if (webObj != null && webObj.ID == WebsiteManager.搜房网_ID && resultHtml.Contains(">此房源已售出!<") && resultHtml.Contains(">您可以选择查看:<"))
    {
        resultDic.Add("NotData", new List <string> { "1" });
    }
    return(resultDic);
}
/// <summary>
/// Reads the city index at http://www.fccs.com/city/ and, for every listed city that is
/// in the hard-coded target list, resolves to a 城市表 row and has no crawl configuration
/// for 中国房产超市 yet, builds an "exec 往网站爬取配置表添加配置信息" statement. Statements
/// are collected in two buffers depending on whether a total page count could be
/// extracted from the city's "second." host page. Finally calls 导出任务计划配置文件().
/// </summary>
public void start()
{
    // Only cities named here are considered.
    string[] targetCities = new string[]
    {
        "济宁", "杭州", "漳州", "威海", "宜昌", "北海", "包头", "滨州", "长春", "大连",
        "东营", "衡水", "湖州", "金华", "九江", "盐城", "嘉兴", "聊城", "芜湖", "临沂",
        "南通", "秦皇岛", "衢州", "日照", "上海", "石家庄", "绍兴", "泰安", "潍坊", "襄阳",
        "银川", "烟台", "淄博", "镇江"
    };

    网站表 site = WebsiteManager.GetWebById(WebsiteManager.中国房产超市_ID);

    // Rule used on each city's listing page to capture the total page count.
    RegexInfo pageCountRule = new RegexInfo(">下一页</a><div[^<>]*><span[^<>]*>[^<>]*</span>/([^<>]+)<", "$1");
    Dictionary<string, RegexInfo> pageCountRules = new Dictionary<string, RegexInfo>
    {
        { "总页数", pageCountRule }
    };

    RegexInfo cityAreaRule = new RegexInfo("<span class=\"hui6c\"[^<>]*>((?:(?!</span).)*)</span>", "$1");
    RegexInfo cityAnchorRule = new RegexInfo("(<a [^<>]+>[^<>]+</a>)", "$1");

    // Step 1: fetch the text area that contains all cities.
    Dictionary<string, RegexInfo> indexRules = new Dictionary<string, RegexInfo>
    {
        { "城市列表文本", cityAreaRule }
    };
    Dictionary<string, List<string>> indexResult = SpiderHelp.GetHtmlByRegex(
        "http://www.fccs.com/city/",
        "gb2312",
        indexRules,
        WebsiteManager.GetWebById(WebsiteManager.中国房产超市_ID),
        CityId);

    List<string> areaMatches = indexResult["城市列表文本"];
    string cityAreaText;
    if (areaMatches.Count < 1)
    {
        cityAreaText = "";
    }
    else
    {
        cityAreaText = indexResult["城市列表文本"][0];
    }

    // Step 2: extract the city anchors from that text area. The same rule dictionary
    // is reused, so it now holds both rules when it is applied.
    indexRules.Add("城市列表", cityAnchorRule);
    indexResult = SpiderHelp.GetStrByRegex(cityAreaText, indexRules);
    List<string> cityAnchors = indexResult["城市列表"];

    StringBuilder withPageCount = new StringBuilder();
    StringBuilder withoutPageCount = new StringBuilder();
    StringBuilder unusedBuffer = new StringBuilder();

    foreach (string anchorHtml in cityAnchors)
    {
        // Split the anchor into the city's display name and its URL.
        RegexInfo nameRule = new RegexInfo("<a [^<>]+>([^<>]+)</a>", "$1");
        RegexInfo urlRule = new RegexInfo("<a[^<>]+href=\"([^<>]+)\"[^<>]*>[^<>]+</a>", "$1");
        Dictionary<string, RegexInfo> anchorRules = new Dictionary<string, RegexInfo>
        {
            { "regexCityName", nameRule },
            { "regexCityUrl", urlRule }
        };
        Dictionary<string, List<string>> anchorParts = SpiderHelp.GetStrByRegex(anchorHtml, anchorRules);

        List<string> nameMatches = anchorParts["regexCityName"];
        string rawName = "";
        if (nameMatches.Count > 0)
        {
            rawName = nameMatches[0];
        }
        string cityName = StringHelp.TrimBlank(rawName);

        List<string> urlMatches = anchorParts["regexCityUrl"];
        string cityUrl = "";
        if (urlMatches.Count > 0)
        {
            cityUrl = urlMatches[0];
        }

        // Ignore cities that are not in the target list.
        if (!targetCities.Any(wanted => wanted.Equals(cityName)))
        {
            continue;
        }

        // Build the SQL statement for this city.
        string sqlTemplate = " exec 往网站爬取配置表添加配置信息 '{0}','{1}','{2}','{3}','{4}',{5},{6}";
        城市表 city = CityManager.Get城市_byLike城市名称(cityName);

        // Skip unknown cities, cities already present in either buffer, and cities
        // that already have a crawl configuration for this website.
        if (city == null
            || withoutPageCount.ToString().Contains(city.城市名称)
            || withPageCount.ToString().Contains(city.城市名称)
            || SpiderWebConfigManager.get网站爬取配置_by城市ID_网站ID(city.ID, WebsiteManager.中国房产超市_ID) != null)
        {
            continue;
        }

        string host = cityUrl.Replace("http://", "http://second.");
        string listUrl = host;
        Dictionary<string, List<string>> pageCountResult = SpiderHelp.GetHtmlByRegex(
            listUrl,
            "gb2312",
            pageCountRules,
            site,
            CityId);

        string sql = string.Format(sqlTemplate, WebsiteManager.中国房产超市, city.城市名称, host, listUrl, "", "1", "1");

        // Cities whose listing page yielded no page count go to the second buffer.
        StringBuilder target = pageCountResult["总页数"].Count() < 1 ? withoutPageCount : withPageCount;
        target.Append(sql).Append("\r\n");
    }

    // Materialized but not used further in this method.
    string result = withPageCount.ToString();
    string result2 = withoutPageCount.ToString();
    string result3 = unusedBuffer.ToString();

    // Target cities whose name does not occur in the first buffer's text.
    List<string> test = targetCities.Where(name => !result.Contains(name)).ToList();

    导出任务计划配置文件();
}