/// <summary>
/// Builds one "exec 往网站爬取配置表添加配置信息" statement per SouFun city site that
/// resolves to a known city, has no crawl configuration yet (the configuration
/// lookup returns null) and whose listing page reports a total count.
/// </summary>
/// <remarks>
/// Flow: download the city index (utf-8), cut out the city-link block, split it
/// into anchors, then for every anchor derive the "esf." listing URLs and probe
/// the listing page (gbk) for the total-count marker.
/// NOTE(review): the generated script only ends up in the local <c>result</c>;
/// nothing visible in this method consumes it - presumably it is inspected
/// under a debugger. Confirm before relying on it.
/// </remarks>
public void start()
{
    var web = WebsiteManager.GetWebById(WebsiteManager.搜房网_ID);

    // One dictionary per stage: each call is handed only the patterns whose
    // results are read afterwards.
    Dictionary<string, RegexInfo> cityBlockRegexes = new Dictionary<string, RegexInfo>();
    cityBlockRegexes.Add("城市列表Text", new RegexInfo("<div class=\"onCont\" id=\"c01\"[^<>]*>((?:(?!</div>).)*)</div>", "$1"));

    Dictionary<string, RegexInfo> cityAnchorRegexes = new Dictionary<string, RegexInfo>();
    cityAnchorRegexes.Add("城市列表", new RegexInfo("(<a href=\"[^\"]+\"[^<>]*>[^<>]+</a>)", "$1"));

    Dictionary<string, RegexInfo> totalCountRegexes = new Dictionary<string, RegexInfo>();
    totalCountRegexes.Add("总条数", new RegexInfo("共找到<strong class=\"number orange\">([\\d]*)</strong>条", "$1"));

    // Loop-invariant patterns that split a single anchor into link text and href.
    Dictionary<string, RegexInfo> anchorPartRegexes = new Dictionary<string, RegexInfo>();
    anchorPartRegexes.Add("regexCityName", new RegexInfo("<a href=\"[^\"]+\"[^<>]*>([^<>]+)</a>", "$1"));
    anchorPartRegexes.Add("regexCityUrl", new RegexInfo("<a href=\"([^\"]+)\"[^<>]*>[^<>]+</a>", "$1"));

    const string execTemplate = " exec 往网站爬取配置表添加配置信息 '{0}','{1}','{2}','{3}','{4}',{5},{6}";

    // Stage 1: the HTML fragment that contains all city links.
    Dictionary<string, List<string>> cityBlockMatches = SpiderHelp.GetHtmlByRegex("http://soufun.com/SoufunFamily.htm", "utf-8", cityBlockRegexes, web, CityId);
    string cityText = cityBlockMatches["城市列表Text"].Count > 0 ? cityBlockMatches["城市列表Text"][0] : "";

    // Stage 2: the individual <a> elements inside that fragment.
    List<string> cityList = SpiderHelp.GetStrByRegex(cityText, cityAnchorRegexes)["城市列表"];

    StringBuilder citySb = new StringBuilder();
    foreach (string cityInfoStr in cityList)
    {
        Dictionary<string, List<string>> anchorParts = SpiderHelp.GetStrByRegex(cityInfoStr, anchorPartRegexes);
        string cityName = StringHelp.TrimBlank(anchorParts["regexCityName"].Count > 0 ? anchorParts["regexCityName"][0] : "");
        string cityUrl = anchorParts["regexCityUrl"].Count > 0 ? anchorParts["regexCityUrl"][0] : "";

        // Without a name or a URL there is nothing meaningful to look up or fetch.
        // NOTE(review): the city lookup is named like a fuzzy (LIKE) match, so an
        // empty name is kept away from it - confirm its behavior for empty input.
        if (string.IsNullOrEmpty(cityName) || string.IsNullOrEmpty(cityUrl))
        {
            continue;
        }

        城市表 city = CityManager.Get城市_byLike城市名称(cityName);
        if (city == null || SpiderWebConfigManager.get网站爬取配置_by城市ID_网站ID(city.ID, WebsiteManager.搜房网_ID) != null)
        {
            continue;
        }

        // Listing host: the city URL with an "esf." label inserted after the scheme.
        string houseUrl1 = cityUrl.Replace("http://", "http://esf.").TrimEnd('/');
        if (city.城市名称.Contains("北京"))
        {
            // For city names containing 北京 the "bj." label is removed from the URL.
            houseUrl1 = houseUrl1.Replace("bj.", "");
        }

        string houseUrl2 = houseUrl1 + "/house/h316-j3100-w32/";

        // Keep the city only when the listing page yields a non-empty total count.
        Dictionary<string, List<string>> countMatches = SpiderHelp.GetHtmlByRegex(houseUrl2, "gbk", totalCountRegexes, web, CityId);
        string count = countMatches["总条数"].Count > 0 ? countMatches["总条数"][0] : "";
        if (string.IsNullOrEmpty(count))
        {
            continue;
        }

        // The URLs come from scraped HTML and are placed inside single-quoted SQL
        // literals, so embedded single quotes are doubled.
        string execStr = string.Format(
            execTemplate,
            WebsiteManager.搜房网,
            city.城市名称,
            houseUrl1.Replace("'", "''"),
            houseUrl2.Replace("'", "''"),
            "",
            "4000",
            "2000");
        citySb.Append(execStr).Append("\r\n");
    }

    string result = citySb.ToString();
}
/// <summary>
/// Builds one "exec 往网站爬取配置表添加配置信息" statement per 58.com city link that
/// resolves to a known city and has no crawl configuration yet (the
/// configuration lookup returns null), then calls 导出任务计划配置文件.
/// </summary>
/// <remarks>
/// Flow: download the change-city page (utf-8), cut out the city-list block,
/// split it into anchors and turn each anchor into a host URL (the link with
/// "/ershoufang" removed) plus a list URL (the link with exactly one trailing
/// slash).
/// NOTE(review): the generated script only ends up in the local <c>result</c>;
/// nothing visible in this method consumes it - presumably it is inspected
/// under a debugger. Confirm before relying on it.
/// </remarks>
public void start()
{
    // One dictionary per stage: each call is handed only the patterns whose
    // results are read afterwards.
    Dictionary<string, RegexInfo> cityBlockRegexes = new Dictionary<string, RegexInfo>();
    cityBlockRegexes.Add("城市列表Text", new RegexInfo("<dl id=\"clist\">((?:(?!</dl>).)*)</dl>", "$1"));

    Dictionary<string, RegexInfo> cityAnchorRegexes = new Dictionary<string, RegexInfo>();
    cityAnchorRegexes.Add("城市列表", new RegexInfo("(<a href=\"[^\"]+\" onclick=\"co\\([^\"]+\">[^<>]+</a>)", "$1"));

    // Loop-invariant patterns that split a single anchor into link text and href.
    Dictionary<string, RegexInfo> anchorPartRegexes = new Dictionary<string, RegexInfo>();
    anchorPartRegexes.Add("regexCityName", new RegexInfo("<a href=\"[^\"]+\" onclick=\"co\\([^\"]+\">([^<>]+)</a>", "$1"));
    anchorPartRegexes.Add("regexCityUrl", new RegexInfo("<a href=\"([^\"]+)\" onclick=\"co\\([^\"]+\">[^<>]+</a>", "$1"));

    const string execTemplate = " exec 往网站爬取配置表添加配置信息 '{0}','{1}','{2}','{3}','{4}',{5},{6}";

    // Stage 1: the HTML fragment that contains all city links.
    Dictionary<string, List<string>> cityBlockMatches = SpiderHelp.GetHtmlByRegex("http://www.58.com/ershoufang/changecity/", "utf-8", cityBlockRegexes, WebsiteManager.GetWebById(WebsiteManager.五八同城_ID), CityId);
    string cityText = cityBlockMatches["城市列表Text"].Count > 0 ? cityBlockMatches["城市列表Text"][0] : "";

    // Stage 2: the individual <a> elements inside that fragment.
    List<string> cityList = SpiderHelp.GetStrByRegex(cityText, cityAnchorRegexes)["城市列表"];

    StringBuilder citySb = new StringBuilder();
    foreach (string cityInfoStr in cityList)
    {
        Dictionary<string, List<string>> anchorParts = SpiderHelp.GetStrByRegex(cityInfoStr, anchorPartRegexes);
        string cityName = StringHelp.TrimBlank(anchorParts["regexCityName"].Count > 0 ? anchorParts["regexCityName"][0] : "");
        string cityUrl = anchorParts["regexCityUrl"].Count > 0 ? anchorParts["regexCityUrl"][0] : "";

        // Without a name or a URL no usable configuration row can be produced.
        // NOTE(review): the city lookup is named like a fuzzy (LIKE) match, so an
        // empty name is kept away from it - confirm its behavior for empty input.
        if (string.IsNullOrEmpty(cityName) || string.IsNullOrEmpty(cityUrl))
        {
            continue;
        }

        城市表 city = CityManager.Get城市_byLike城市名称(cityName);
        if (city == null || SpiderWebConfigManager.get网站爬取配置_by城市ID_网站ID(city.ID, WebsiteManager.五八同城_ID) != null)
        {
            continue;
        }

        string hostUrl = cityUrl.Replace("/ershoufang", "");
        string listUrl = cityUrl.TrimEnd('/') + "/";

        // The URLs come from scraped HTML and are placed inside single-quoted SQL
        // literals, so embedded single quotes are doubled.
        string execStr = string.Format(
            execTemplate,
            WebsiteManager.五八同城,
            city.城市名称,
            hostUrl.Replace("'", "''"),
            listUrl.Replace("'", "''"),
            "",
            "2000",
            "2000");
        citySb.Append(execStr).Append("\r\n");
    }

    string result = citySb.ToString();
    导出任务计划配置文件();
}
/// <summary>
/// Builds "exec 往网站爬取配置表添加配置信息" statements for cityhouse.cn city sites
/// that resolve to a known city and have no crawl configuration yet (the
/// configuration lookup returns null), then calls 导出任务计划配置文件.
/// </summary>
/// <remarks>
/// Every candidate city's for-sale list page is probed for the total-page
/// marker. Statements for pages where the marker is found are collected in
/// <c>result</c>; statements for pages where it is missing go to
/// <c>result2</c>. Each city name is emitted at most once across both scripts.
/// NOTE(review): neither script is consumed by anything visible in this
/// method - presumably both are inspected under a debugger. Confirm.
/// </remarks>
public void start()
{
    网站表 webObj = WebsiteManager.GetWebById(WebsiteManager.城市房产_ID);

    // Pattern applied to each city's list page to detect the pager.
    Dictionary<string, RegexInfo> 根页面正则字典集合 = new Dictionary<string, RegexInfo>();
    根页面正则字典集合.Add("总页数", new RegexInfo("<div class=\"[^\"]*\"><span class='fl mr'>\\d+/(\\d+)</span>", "$1"));

    // Pattern that pulls every city anchor out of the city index page.
    Dictionary<string, RegexInfo> cityAnchorRegexes = new Dictionary<string, RegexInfo>();
    cityAnchorRegexes.Add("城市列表", new RegexInfo("(<a[^<>]+href=[\",']{1,1}http\\://[^\\.]+.cityhouse.cn[\",']{1,1}[^<>]*>[^<>]+</a>)", "$1"));

    // Loop-invariant patterns that split a single anchor into link text and href.
    Dictionary<string, RegexInfo> anchorPartRegexes = new Dictionary<string, RegexInfo>();
    anchorPartRegexes.Add("regexCityName", new RegexInfo("<a[^<>]+href=[\",']{1,1}http\\://[^\\.]+.cityhouse.cn[\",']{1,1}[^<>]*>([^<>]+)</a>", "$1"));
    anchorPartRegexes.Add("regexCityUrl", new RegexInfo("<a[^<>]+href=[\",']{1,1}(http\\://[^\\.]+.cityhouse.cn)[\",']{1,1}[^<>]*>[^<>]+</a>", "$1"));

    const string execTemplate = " exec 往网站爬取配置表添加配置信息 '{0}','{1}','{2}','{3}','{4}',{5},{6}";

    Dictionary<string, List<string>> cityIndexMatches = SpiderHelp.GetHtmlByRegex("http://www.cityhouse.cn/city.html", "utf-8", cityAnchorRegexes, webObj, CityId);
    List<string> cityList = cityIndexMatches["城市列表"];

    StringBuilder citySb = new StringBuilder();   // cities whose list page shows a pager
    StringBuilder citySb2 = new StringBuilder();  // cities whose list page shows none

    // City names already written to either script. An exact-match set replaces
    // searching the whole generated text for the name on every iteration.
    HashSet<string> emittedCityNames = new HashSet<string>();

    foreach (string cityInfoStr in cityList)
    {
        Dictionary<string, List<string>> anchorParts = SpiderHelp.GetStrByRegex(cityInfoStr, anchorPartRegexes);
        string cityName = StringHelp.TrimBlank(anchorParts["regexCityName"].Count > 0 ? anchorParts["regexCityName"][0] : "");
        string cityUrl = anchorParts["regexCityUrl"].Count > 0 ? anchorParts["regexCityUrl"][0] : "";

        // Without a name or a URL there is nothing meaningful to look up or fetch.
        // NOTE(review): the city lookup is named like a fuzzy (LIKE) match, so an
        // empty name is kept away from it - confirm its behavior for empty input.
        if (string.IsNullOrEmpty(cityName) || string.IsNullOrEmpty(cityUrl))
        {
            continue;
        }

        城市表 city = CityManager.Get城市_byLike城市名称(cityName);
        if (city == null
            || emittedCityNames.Contains(city.城市名称)
            || SpiderWebConfigManager.get网站爬取配置_by城市ID_网站ID(city.ID, WebsiteManager.城市房产_ID) != null)
        {
            continue;
        }

        // The list page is requested with itself as the referer.
        string listUrl = cityUrl + "/forsale/flist.html?ob=10";
        Dictionary<string, List<string>> 根页面正则字典集合结果 = SpiderHelp.GetHtmlByRegex(listUrl, "utf-8", 根页面正则字典集合, webObj, CityId, referer: listUrl);

        // The URLs come from scraped HTML and are placed inside single-quoted SQL
        // literals, so embedded single quotes are doubled.
        string execStr = string.Format(
            execTemplate,
            WebsiteManager.城市房产,
            city.城市名称,
            cityUrl.Replace("'", "''"),
            listUrl.Replace("'", "''"),
            "",
            "2000",
            "2000");

        bool hasPager = 根页面正则字典集合结果["总页数"].Count > 0;
        StringBuilder target = hasPager ? citySb : citySb2;
        target.Append(execStr).Append("\r\n");
        emittedCityNames.Add(city.城市名称);
    }

    string result = citySb.ToString();
    string result2 = citySb2.ToString();
    导出任务计划配置文件();
}
/// <summary>
/// Builds "exec 往网站爬取配置表添加配置信息" statements for the whitelisted fccs.com
/// city sites that resolve to a known city and have no crawl configuration yet
/// (the configuration lookup returns null), then calls 导出任务计划配置文件.
/// </summary>
/// <remarks>
/// Every candidate city's "second." host is probed for the total-page marker.
/// Statements for pages where the marker is found are collected in
/// <c>result</c>; statements for pages where it is missing go to
/// <c>result2</c>. Each city name is emitted at most once across both scripts.
/// <c>notGenerated</c> lists the whitelist entries whose text does not occur
/// in <c>result</c>.
/// NOTE(review): none of these locals is consumed by anything visible in this
/// method - presumably they are inspected under a debugger. Confirm.
/// </remarks>
public void start()
{
    // Only links whose text equals one of these names are processed.
    string[] citys = new string[]
    {
        "济宁", "杭州", "漳州", "威海", "宜昌", "北海", "包头", "滨州", "长春", "大连",
        "东营", "衡水", "湖州", "金华", "九江", "盐城", "嘉兴", "聊城", "芜湖", "临沂",
        "南通", "秦皇岛", "衢州", "日照", "上海", "石家庄", "绍兴", "泰安", "潍坊", "襄阳",
        "银川", "烟台", "淄博", "镇江"
    };

    网站表 webObj = WebsiteManager.GetWebById(WebsiteManager.中国房产超市_ID);

    // Pattern applied to each city's list page to detect the pager.
    Dictionary<string, RegexInfo> 根页面正则字典集合 = new Dictionary<string, RegexInfo>();
    根页面正则字典集合.Add("总页数", new RegexInfo(">下一页</a><div[^<>]*><span[^<>]*>[^<>]*</span>/([^<>]+)<", "$1"));

    // One dictionary per stage: each call is handed only the patterns whose
    // results are read afterwards.
    Dictionary<string, RegexInfo> cityBlockRegexes = new Dictionary<string, RegexInfo>();
    cityBlockRegexes.Add("城市列表文本", new RegexInfo("<span class=\"hui6c\"[^<>]*>((?:(?!</span).)*)</span>", "$1"));

    Dictionary<string, RegexInfo> cityAnchorRegexes = new Dictionary<string, RegexInfo>();
    cityAnchorRegexes.Add("城市列表", new RegexInfo("(<a [^<>]+>[^<>]+</a>)", "$1"));

    // Loop-invariant patterns that split a single anchor into link text and href.
    Dictionary<string, RegexInfo> anchorPartRegexes = new Dictionary<string, RegexInfo>();
    anchorPartRegexes.Add("regexCityName", new RegexInfo("<a [^<>]+>([^<>]+)</a>", "$1"));
    anchorPartRegexes.Add("regexCityUrl", new RegexInfo("<a[^<>]+href=\"([^<>]+)\"[^<>]*>[^<>]+</a>", "$1"));

    const string execTemplate = " exec 往网站爬取配置表添加配置信息 '{0}','{1}','{2}','{3}','{4}',{5},{6}";

    // Fetch the text region that holds all cities.
    Dictionary<string, List<string>> cityBlockMatches = SpiderHelp.GetHtmlByRegex("http://www.fccs.com/city/", "gb2312", cityBlockRegexes, webObj, CityId);
    string cityText = cityBlockMatches["城市列表文本"].Count < 1 ? "" : cityBlockMatches["城市列表文本"][0];

    // Extract the city anchors from that text region.
    List<string> cityList = SpiderHelp.GetStrByRegex(cityText, cityAnchorRegexes)["城市列表"];

    StringBuilder citySb = new StringBuilder();   // cities whose list page shows a pager
    StringBuilder citySb2 = new StringBuilder();  // cities whose list page shows none

    // City names already written to either script. An exact-match set replaces
    // searching the whole generated text for the name on every iteration.
    HashSet<string> emittedCityNames = new HashSet<string>();

    foreach (string cityInfoStr in cityList)
    {
        // Split the anchor into city name and URL.
        Dictionary<string, List<string>> anchorParts = SpiderHelp.GetStrByRegex(cityInfoStr, anchorPartRegexes);
        string cityName = StringHelp.TrimBlank(anchorParts["regexCityName"].Count > 0 ? anchorParts["regexCityName"][0] : "");
        string cityUrl = anchorParts["regexCityUrl"].Count > 0 ? anchorParts["regexCityUrl"][0] : "";

        // Skip cities that are not on the whitelist.
        if (!citys.Contains(cityName))
        {
            continue;
        }

        // The anchor pattern above does not require a double-quoted href, so the
        // URL can be empty here; there is nothing to fetch in that case.
        if (string.IsNullOrEmpty(cityUrl))
        {
            continue;
        }

        城市表 city = CityManager.Get城市_byLike城市名称(cityName);
        if (city == null
            || emittedCityNames.Contains(city.城市名称)
            || SpiderWebConfigManager.get网站爬取配置_by城市ID_网站ID(city.ID, WebsiteManager.中国房产超市_ID) != null)
        {
            continue;
        }

        // Host and list URL are the same address: the city URL with a "second."
        // label inserted after the scheme.
        string host = cityUrl.Replace("http://", "http://second.");
        string hostlist = host;
        Dictionary<string, List<string>> 根页面正则字典集合结果 = SpiderHelp.GetHtmlByRegex(hostlist, "gb2312", 根页面正则字典集合, webObj, CityId);

        // Build the SQL statement. The URLs come from scraped HTML and are placed
        // inside single-quoted SQL literals, so embedded single quotes are doubled.
        string execStr = string.Format(
            execTemplate,
            WebsiteManager.中国房产超市,
            city.城市名称,
            host.Replace("'", "''"),
            hostlist.Replace("'", "''"),
            "",
            "1",
            "1");

        bool hasPager = 根页面正则字典集合结果["总页数"].Count > 0;
        StringBuilder target = hasPager ? citySb : citySb2;
        target.Append(execStr).Append("\r\n");
        emittedCityNames.Add(city.城市名称);
    }

    string result = citySb.ToString();
    string result2 = citySb2.ToString();

    // Whitelist entries that do not appear anywhere in the main script.
    List<string> notGenerated = citys.Where(obj => !result.Contains(obj)).ToList();
    导出任务计划配置文件();
}