/// <summary>
/// Normalizes the string columns of a <see cref="NewHouse"/> in place and fills in the
/// derived columns (building type, purpose, house type, case time).
/// </summary>
/// <param name="newHouse">The record to normalize. It is mutated and returned.</param>
/// <returns>The same <paramref name="newHouse"/> instance that was passed in.</returns>
public static NewHouse ToColumnStr(this NewHouse newHouse)
{
    // --- Clean up raw text columns ---
    newHouse.Lpm = StringHelp.TrimBlank(newHouse.Lpm).ToRemoveSpe();

    // Trim()/ToRemoveSpe() are only applied when a value is present, so a missing
    // district no longer raises NullReferenceException.
    string district = newHouse.Xzq;
    if (district != null)
    {
        district = district.Trim().ToRemoveSpe();
    }
    newHouse.Xzq = StringHelp.TrimBlank(district);

    // Structure falls back to "平面" when nothing usable was scraped.
    string structure = StringHelp.TrimBlank(newHouse.Jg);
    newHouse.Jg = string.IsNullOrEmpty(structure) ? "平面" : structure;

    newHouse.Zj = StringHelp.TrimBlank(newHouse.Zj);
    newHouse.Cx = StringHelp.TrimBlank(newHouse.Cx).ToRemoveSpe();
    newHouse.Phone = StringHelp.TrimBlank(newHouse.Phone).ToRemoveSpe();

    // Numeric columns keep only the part before the first '.'.
    newHouse.Mj = StripFraction(newHouse.Mj);
    newHouse.Dj = StripFraction(newHouse.Dj);
    newHouse.Hymj = StripFraction(newHouse.Hymj);
    newHouse.Dxsmj = StripFraction(newHouse.Dxsmj);

    // --- Derived columns (order matters: Yt depends on the cleaned Mj and on Jzlx) ---
    newHouse.Jzlx = SpiderHelp.GetBuildingType(newHouse.Zlc);
    newHouse.Yt = SpiderHelp.GetHousePurposes(newHouse.Mj, newHouse.Jzlx);
    newHouse.Hx = SpiderHelp.GetHouseType(newHouse.Hx).ToRemoveSpe();

    // --- Case time: use the current time when the scraped value is not a date ---
    if (newHouse.Alsj != null)
    {
        newHouse.Alsj = newHouse.Alsj.Trim();
    }
    DateTime caseTime = StringHelp.CheckStrIsDate(newHouse.Alsj)
        ? Convert.ToDateTime(newHouse.Alsj)
        : DateTime.Now;
    newHouse.Alsj = caseTime.ToString("yyyy-MM-dd HH:mm:ss");

    return newHouse;
}

/// <summary>
/// Removes everything from the first '.' to the end of the line; null is passed through
/// unchanged because Regex.Replace rejects a null input.
/// </summary>
private static string StripFraction(string value)
{
    return value == null ? null : Regex.Replace(value, @"\..*", "");
}
/// <summary>
/// Stores one crawled listing through <c>CaseManager</c>. A debug line is logged before
/// the insert, and any exception raised by the insert is logged instead of propagated.
/// </summary>
/// <param name="newHouse">The listing to store; a null value is ignored.</param>
public virtual void SaveNowData(NewHouse newHouse)
{
    if (newHouse == null)
    {
        return;
    }

    // Announce the save attempt.
    string timestamp = DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss");
    string progressText = string.Format("{0}-数据保存中:网站:{1}--城市:{2}-(url:{3}--)", timestamp, 网站名称, CityName, newHouse.Url);
    log.Debug(progressText);

    try
    {
        CaseManager.往案例表插入爬取数据(
            网站名称: 网站名称,
            城市名称: CityName,
            网站ID: 网站ID,
            城市ID: CityId,
            楼盘名: newHouse.Lpm,
            案例时间: newHouse.Alsj,
            行政区: newHouse.Xzq,
            片区: newHouse.Pq,
            楼栋: newHouse.Ld,
            房号: newHouse.Fh,
            用途: newHouse.Yt,
            面积: newHouse.Mj,
            单价: newHouse.Dj,
            案例类型: newHouse.Allx,
            结构: newHouse.Jg,
            建筑类型: newHouse.Jzlx,
            总价: newHouse.Zj,
            所在楼层: newHouse.Szlc,
            总楼层: newHouse.Zlc,
            户型: newHouse.Hx,
            朝向: newHouse.Cx,
            装修: newHouse.Zx,
            建筑年代: newHouse.Jznd,
            信息: newHouse.Title,
            电话: newHouse.Phone,
            URL: newHouse.Url,
            币种: newHouse.Bz,
            地址: newHouse.Addres,
            创建时间: DateTime.Now,
            车位数量: newHouse.Cwsl,
            地下室面积: newHouse.Dxsmj,
            花园面积: newHouse.Hymj,
            建筑形式: newHouse.Jzxs,
            配套设施: newHouse.Ptss,
            厅结构: newHouse.Tjg,
            中介公司: newHouse.ComName,
            门店: newHouse.ComArea,
            startSpiderDate: nowDate);
    }
    catch (Exception ex)
    {
        // A failed insert must not abort the crawl; record it and carry on.
        string failureText = string.Format("数据保存中异常:网站:{0}--城市:{1}-(url:{2}--)", 网站名称, CityName, newHouse.Url);
        log.Error(failureText, ex);
    }
}
/// <summary>
/// Validates the area, unit price and floor values of a listing, stopping at the first failure.
/// </summary>
/// <param name="newHouse">The listing to validate.</param>
/// <param name="cityName">City name, used only in the failure message.</param>
/// <param name="message">Empty when all checks pass; otherwise describes the first failed check.</param>
/// <returns><c>true</c> when every check passes; otherwise <c>false</c>.</returns>
public static bool CheckHouseAll(NewHouse newHouse, string cityName, out string message)
{
    message = "";

    if (!SpiderHelp.CheckHouseArea(newHouse.Mj))
    {
        message = string.Format("面积不合格,下一个:{0}-(value_mj:{1},url:{2}--)", cityName, newHouse.Mj, newHouse.Url);
        return false;
    }

    if (!SpiderHelp.CheckHouseUnitPrice(newHouse.Dj))
    {
        message = string.Format("单价不合格,下一个:{0}-(value_dj:{1},url:{2}--)", cityName, newHouse.Dj, newHouse.Url);
        return false;
    }

    if (!SpiderHelp.CheckHouseFloor(newHouse.Zlc, newHouse.Szlc))
    {
        // Each argument gets its own placeholder. Previously {1} was used twice and {2}
        // was labelled "url", so Szlc was printed as the url and the real url was dropped.
        message = string.Format("所在楼层或总楼层不合格,下一个:{0}-(value_zlc:{1},value_szlc:{2},url:{3}--)", cityName, newHouse.Zlc, newHouse.Szlc, newHouse.Url);
        return false;
    }

    return true;
}
//static string key_jzlb = "建筑类别";
//static string key_cqxz = "产权性质";
//static string key_ptss = "配套设施";
#endregion

/// <summary>
/// Appends one listing as a new row to an .xls workbook under the "DataSource" folder.
/// The file name is built from today's date, the listing's source site and the city.
/// When the workbook does not exist yet it is created with a header row first.
/// Failures are logged rather than propagated.
/// </summary>
/// <param name="cityName">City name; used in the file name and in log output.</param>
/// <param name="newHouse">The listing to write; a null value is ignored.</param>
public static void SaveExcel(string cityName, NewHouse newHouse)
{
    // Must run before anything reads a member of newHouse (the log line and the file
    // name below both do), otherwise a null listing raises NullReferenceException.
    if (newHouse == null)
    {
        return;
    }

    log.Debug(string.Format("Excel保存中:{0}-(url:{1}--)", cityName, newHouse.Url));

    string nowDate = DateTime.Now.ToString("yyyy-MM-dd");
    string fileName = string.Format("{0}_{1}_{2}.xls", nowDate, newHouse.Wzly, cityName);
    string path = SpiderHelp.GetConfigDire() + "DataSource\\" + fileName;
    string directory = Path.GetDirectoryName(path);
    if (!Directory.Exists(directory))
    {
        Directory.CreateDirectory(directory);
    }

    // Header captions and row values share the same column order (columns 1..25).
    object[] headers =
    {
        SaveData.key_lpm, SaveData.key_alsj, SaveData.key_xzq, SaveData.key_pq, SaveData.key_ld,
        SaveData.key_fh, SaveData.key_yt, SaveData.key_mj, SaveData.key_dj, SaveData.key_allx,
        SaveData.key_jg, SaveData.key_jzlx, SaveData.key_zj, SaveData.key_szlc, SaveData.key_zlc,
        SaveData.key_hx, SaveData.key_cx, SaveData.key_zx, SaveData.key_jznd, SaveData.key_title,
        SaveData.key_phone, SaveData.key_url, SaveData.key_bz, SaveData.key_wzly, SaveData.key_address
    };
    string[] values =
    {
        newHouse.Lpm, newHouse.Alsj, newHouse.Xzq, newHouse.Pq, newHouse.Ld,
        newHouse.Fh, newHouse.Yt, newHouse.Mj, newHouse.Dj, newHouse.Allx,
        newHouse.Jg, newHouse.Jzlx, newHouse.Zj, newHouse.Szlc, newHouse.Zlc,
        newHouse.Hx, newHouse.Cx, newHouse.Zx, newHouse.Jznd, newHouse.Title,
        newHouse.Phone, newHouse.Url, newHouse.Bz, newHouse.Wzly, newHouse.Addres
    };

    Excel.Application app = new Excel.Application();
    Excel.Workbook book = null;
    object missing = System.Reflection.Missing.Value;
    bool failed = false;
    try
    {
        int nowRow;
        bool existsFile = File.Exists(path);
        Excel.Worksheet sheet;
        if (existsFile)
        {
            // Existing workbook: append after the last used row of the first sheet.
            app.Workbooks.Open(path, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing);
            book = (Excel.Workbook)app.ActiveWorkbook;
            sheet = (Excel.Worksheet)book.Sheets[1];
            nowRow = sheet.UsedRange.Cells.Rows.Count;
        }
        else
        {
            // New workbook: write the header row, data starts on row 2.
            app.Application.Workbooks.Add(true);
            book = (Excel.Workbook)app.ActiveWorkbook;
            sheet = (Excel.Worksheet)book.ActiveSheet;
            for (int column = 0; column < headers.Length; column++)
            {
                sheet.Cells[1, column + 1] = headers[column];
            }
            nowRow = 1;
        }

        // Null values are written as empty strings. Column 25 (address) previously tested
        // Wzly instead of Addres, so a null address could reach ExcelReplaceStr.
        int targetRow = nowRow + 1;
        for (int column = 0; column < values.Length; column++)
        {
            sheet.Cells[targetRow, column + 1] = ExcelReplaceStr(values[column] ?? "");
        }

        // Persist: save in place for an existing file, otherwise write a copy to the target path.
        if (existsFile)
        {
            book.Save();
        }
        else
        {
            book.SaveCopyAs(path);
        }
    }
    catch (Exception ex)
    {
        failed = true;
        log.Error(string.Format("title:{0}--lpm:{1}--excel导入异常", newHouse.Title, newHouse.Lpm), ex);
    }
    finally
    {
        // Close and quit independently so a failure while closing the workbook cannot
        // skip app.Quit() and leave an Excel process running.
        try
        {
            if (book != null)
            {
                book.Close(false, missing, missing);
            }
        }
        catch (Exception closeEx)
        {
            log.Error(string.Format("title:{0}--lpm:{1}--excel关闭异常", newHouse.Title, newHouse.Lpm), closeEx);
        }

        try
        {
            app.Quit();
        }
        catch (Exception quitEx)
        {
            log.Error(string.Format("title:{0}--lpm:{1}--excel退出异常", newHouse.Title, newHouse.Lpm), quitEx);
        }

        if (failed)
        {
            // NOTE(review): pause kept from the original failure path; presumably it gives
            // Excel time to shut down before the caller continues - confirm it is still needed.
            System.Threading.Thread.Sleep(2000);
        }
    }
}
/// <summary>
/// Downloads a listing detail page (decoded as gb2312), extracts its fields with the
/// configured regex rules, builds a <c>NewHouse</c> and saves it. Listings whose purpose
/// is rejected by <c>CheckPurpose</c> are skipped. Any exception is logged, not propagated.
/// </summary>
/// <param name="url">Detail page URL.</param>
/// <param name="urlPanelHtml">Text that is searched with <c>regex_updatetime</c> for the listing's update time.</param>
public void GetHouseByUrl(string url, string urlPanelHtml)
{
    try
    {
        // Extraction rule per field.
        Dictionary<string, RegexInfo> rules = new Dictionary<string, RegexInfo>
        {
            { "*regex_lpm", regex_lpm },
            { "*regex_xzq", regex_xzq },
            { "regex_pq", regex_pq },
            { "regex_hx", regex_hx },
            { "*regex_mj", regex_mj },
            { "*regex_dj", regex_dj },
            { "*regex_zj", regex_zj },
            { "regex_jznd", regex_jznd },
            { "regex_cx", regex_cx },
            { "regex_szlc", regex_szlc },
            { "regex_zlc", regex_zlc },
            { "regex_jg", regex_jg },
            { "regex_zx", regex_zx },
            { "regex_title", regex_title },
            { "regex_phone", regex_phone },
            { "regex_address", regex_address },
            { "regex_datetime", regex_datetime },
            { "regex_comName", regex_comName },
            { "regex_comArea", regex_comArea },
            { "regex_yt", regex_yt }
        };

        // Fetch the page and apply the rules; the update time comes from the separate text.
        Dictionary<string, List<string>> matches = SpiderHelp.GetHtmlByRegex(url, "gb2312", rules, WebObj, CityId, timeout: 30000);
        List<string> updateTimes = SpiderHelp.GetStrByRegexByIndex(urlPanelHtml, regex_updatetime);

        // First hit of a rule, or "" when the rule produced nothing.
        Func<string, string> first = key =>
        {
            List<string> hits = matches[key];
            return hits.Count < 1 ? "" : hits[0];
        };

        string title = first("regex_title");
        string lpm = first("*regex_lpm");
        string xzq = first("*regex_xzq");
        string pq = first("regex_pq");
        string hx = first("regex_hx");
        string mj = first("*regex_mj");
        string dj = first("*regex_dj");
        string zj = first("*regex_zj");
        string jznd = first("regex_jznd");
        string cx = first("regex_cx");
        string szlc = first("regex_szlc");
        string zlc = first("regex_zlc");
        string jg = first("regex_jg");
        string yt = first("regex_yt");
        string zx = first("regex_zx");
        string phone = first("regex_phone");
        string address = first("regex_address");
        string publishTime = first("regex_datetime");
        string comName = first("regex_comName");
        string comArea = first("regex_comArea");
        string updateTime = updateTimes.Count < 1 ? "" : updateTimes[0];

        // CheckPurpose returns "0" for a purpose this crawler does not want.
        string checkedPurpose = CheckPurpose(yt);
        if (checkedPurpose == "0")
        {
            log.Debug(string.Format("GetHouseByUrl()用途无效,url:{0}, cityName:{1},用途:{2}", url, CityName, yt));
            return;
        }
        // NOTE(review): checkedPurpose is not forwarded to the NewHouse constructor below
        // (that slot receives "") - confirm whether this is intended.

        cx = cx.Replace("朝", "").TrimBlank();

        NewHouse newHouse = new NewHouse(
            lpm, GetCaseDate(publishTime, updateTime), xzq, pq,
            "", "", "",
            mj, dj, "", jg, "", zj,
            szlc, zlc, hx, cx, zx, jznd,
            title, phone, url, "", 网站名称, address,
            "", "", "", "", "", "",
            comName, comArea);

        // Normalise the case time; fall back to "now" when it is not a valid date.
        newHouse.Alsj = newHouse.Alsj == null ? null : newHouse.Alsj.Trim();
        if (!newHouse.Alsj.CheckStrIsDate())
        {
            newHouse.Alsj = DateTime.Now.ToString();
        }

        // A crawl started before noon accepts listings back to yesterday;
        // one started at noon or later accepts only today's listings.
        int startHour = Convert.ToInt32(nowDate.ToString("HH"));
        DateTime caseTime = Convert.ToDateTime(newHouse.Alsj);
        DateTime startDay = Convert.ToDateTime(nowDate.ToString("yyyy-MM-dd"));
        DateTime oldestAccepted = startHour < 12 ? startDay.AddDays(-1) : startDay;
        if (caseTime < oldestAccepted)
        {
            isNowPageStop = true;
        }

        // The type pages are crawled by multiple threads, so for now the stop flag is
        // always cleared again and the crawl never stops early.
        isNowPageStop = false;

        SaveNowData(newHouse);
        log.Debug(string.Format("{0}-数据保存完成url:{1}--cityname:{2}--value_title:{3}--value_lpm{4}", DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), url, CityName, title, lpm));
    }
    catch (Exception ex)
    {
        log.Error(string.Format("GetHouseByUrl()异常,url:{0}, cityName:{1}", url, CityName), ex);
    }
}
/// <summary>
/// Downloads a listing detail page (decoded as utf-8), extracts and tidies its fields
/// with the configured regex rules, builds a <c>NewHouse</c> and saves it.
/// Any exception is logged, not propagated.
/// </summary>
/// <param name="url">Detail page URL.</param>
public void GetHouseByUrl(string url)
{
    //url = "http://esf.hd.zhijia.com/374810.html";
    try
    {
        // Extraction rule per field.
        Dictionary<string, RegexInfo> rules = new Dictionary<string, RegexInfo>
        {
            { "*regex_lpm", regex_lpm },
            { "*regex_xzq", regex_xzq },
            { "regex_pq", regex_pq },
            { "regex_hx", regex_hx },
            { "*regex_mj", regex_mj },
            { "*regex_dj", regex_dj },
            { "*regex_zj", regex_zj },
            { "regex_jznd", regex_jznd },
            { "regex_cx", regex_cx },
            { "regex_szlc", regex_szlc },
            { "regex_zlc", regex_zlc },
            { "regex_jg", regex_jg },
            { "regex_yt", regex_yt },
            { "regex_zx", regex_zx },
            { "regex_title", regex_title },
            { "regex_phone", regex_phone },
            { "*regex_address", regex_address },
            { "regex_datetime", regex_datetime },
            { "regex_comName", regex_comName },
            { "regex_comArea", regex_comArea }
        };

        Dictionary<string, List<string>> matches = SpiderHelp.GetHtmlByRegex(url, "utf-8", rules, WebObj, CityId, keepAlive: true);

        // First hit of a rule as-is, or "" when the rule produced nothing.
        Func<string, string> first = key =>
        {
            List<string> hits = matches[key];
            return hits.Count < 1 ? "" : hits[0];
        };
        // Same, but a present hit is passed through TrimBlank().
        Func<string, string> firstTrimmed = key =>
        {
            List<string> hits = matches[key];
            return hits.Count < 1 ? "" : hits[0].TrimBlank();
        };

        string title = first("regex_title").Replace(" ", "");
        string lpm = firstTrimmed("*regex_lpm");
        string xzq = firstTrimmed("*regex_xzq");
        string pq = firstTrimmed("regex_pq");
        string hx = firstTrimmed("regex_hx");
        string mj = firstTrimmed("*regex_mj");
        string dj = firstTrimmed("*regex_dj");
        string zj = firstTrimmed("*regex_zj");
        string jznd = firstTrimmed("regex_jznd").Replace(" ", "");
        string cx = firstTrimmed("regex_cx").Replace(" ", "");
        string szlc = firstTrimmed("regex_szlc");
        string zlc = firstTrimmed("regex_zlc");
        string jg = firstTrimmed("regex_jg");
        string yt = firstTrimmed("regex_yt");
        string zx = firstTrimmed("regex_zx").Replace(" ", "");
        string phone = firstTrimmed("regex_phone");
        string address = firstTrimmed("*regex_address");
        string publishTime = first("regex_datetime").Trim();
        string comName = first("regex_comName");
        string comArea = first("regex_comArea");

        // A value containing "无" means "none": store it as empty.
        if (zx.Contains("无"))
        {
            zx = "";
        }
        if (cx.Contains("无"))
        {
            cx = "";
        }

        // Drop thousands separators from the unit price.
        dj = dj.Replace(",", "");

        // When both floors are integers and the current floor exceeds the total,
        // the two values are assumed to be reversed and are swapped.
        string totalTrimmed = zlc.TrimBlank();
        if (StringHelp.IsInteger(totalTrimmed))
        {
            string currentTrimmed = szlc.TrimBlank();
            if (StringHelp.IsInteger(currentTrimmed)
                && Convert.ToInt32(currentTrimmed) > Convert.ToInt32(totalTrimmed))
            {
                szlc = totalTrimmed;
                zlc = currentTrimmed;
            }
        }

        NewHouse newHouse = new NewHouse(
            lpm, GetCaseDate(publishTime), xzq, pq,
            "", "", yt,
            mj, dj, "", jg, "", zj,
            szlc, zlc, hx, cx, zx, jznd,
            title, phone, url, "", 网站名称, address,
            "", "", "", "", "", "",
            comName, comArea);

        // Normalise the case time; fall back to yesterday when it is not a valid date.
        newHouse.Alsj = newHouse.Alsj == null ? null : newHouse.Alsj.Trim();
        if (!newHouse.Alsj.CheckStrIsDate())
        {
            newHouse.Alsj = DateTime.Now.AddDays(-1).ToString();
        }

        // A crawl started before noon accepts listings back to yesterday;
        // one started at noon or later accepts only today's listings.
        int startHour = Convert.ToInt32(nowDate.ToString("HH"));
        DateTime caseTime = Convert.ToDateTime(newHouse.Alsj);
        DateTime startDay = Convert.ToDateTime(nowDate.ToString("yyyy-MM-dd"));
        DateTime oldestAccepted = startHour < 12 ? startDay.AddDays(-1) : startDay;
        if (caseTime < oldestAccepted)
        {
            isNowPageStop = true;
        }

        // NOTE(review): the flag is unconditionally cleared right after being computed,
        // so the check above currently has no lasting effect - confirm this is intended.
        isNowPageStop = false;

        SaveNowData(newHouse);
        log.Debug(string.Format("{0}-数据保存完成url:{1}--cityname:{2}--value_title:{3}--value_lpm{4}", DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), url, CityName, title, lpm));
    }
    catch (Exception ex)
    {
        log.Error(string.Format("GetHouseByUrl()异常,url:{0}, cityName:{1}", url, CityName), ex);
    }
}
/// <summary>
/// Downloads a listing detail page (decoded as utf-8), extracts its fields - including
/// building form, facilities, garden/basement area, hall structure and parking count -
/// builds a <c>NewHouse</c>, updates the page-stop flag from the case time and saves the
/// listing. Any exception is logged, not propagated.
/// </summary>
/// <param name="url">Detail page URL.</param>
public void GetHouseByUrl(string url)
{
    try
    {
        // Extraction rule per field.
        Dictionary<string, RegexInfo> rules = new Dictionary<string, RegexInfo>
        {
            { "*regex_lpm", regex_lpm },
            { "*regex_xzq", regex_xzq },
            { "regex_pq", regex_pq },
            { "regex_hx", regex_hx },
            { "*regex_mj", regex_mj },
            { "*regex_dj", regex_dj },
            { "*regex_zj", regex_zj },
            { "regex_jznd", regex_jznd },
            { "regex_cx", regex_cx },
            { "regex_szlc", regex_szlc },
            { "regex_zlc", regex_zlc },
            { "regex_jg", regex_jg },
            { "regex_zx", regex_zx },
            { "regex_yt", regex_yt },
            { "regex_jzxs", regex_jzxs },
            { "regex_ptss", regex_ptss },
            { "regex_title", regex_title },
            { "regex_phone", regex_phone },
            { "regex_address", regex_address },
            { "regex_datetime", regex_datetime },
            { "regex_hymj", regex_hymj },
            { "regex_tjg", regex_tjg },
            { "regex_cwsl", regex_cwsl },
            { "regex_dxsmj", regex_dxsmj },
            { "regex_comName", regex_comName },
            { "regex_comArea", regex_comArea }
        };

        Dictionary<string, List<string>> matches = SpiderHelp.GetHtmlByRegex(url, "utf-8", rules, WebObj, CityId);

        // First hit of a rule, or "" when the rule produced nothing.
        Func<string, string> first = key =>
        {
            List<string> hits = matches[key];
            return hits.Count < 1 ? "" : hits[0];
        };

        string title = first("regex_title");
        string lpm = first("*regex_lpm");
        string xzq = first("*regex_xzq");
        string pq = first("regex_pq");
        string hx = first("regex_hx");
        string mj = first("*regex_mj");
        string dj = first("*regex_dj");
        string zj = first("*regex_zj");
        string jznd = first("regex_jznd");
        string cx = first("regex_cx");
        string szlc = first("regex_szlc");
        string zlc = first("regex_zlc");
        string jg = first("regex_jg");
        string zx = first("regex_zx");
        // NOTE(review): the purpose is extracted but not forwarded to the NewHouse
        // constructor below (that slot receives "") - confirm whether this is intended.
        string yt = first("regex_yt");
        string jzxs = first("regex_jzxs");
        string ptss = first("regex_ptss");
        string phone = first("regex_phone");
        string address = first("regex_address");
        string caseTimeText = first("regex_datetime");
        string hymj = first("regex_hymj");
        string tjg = first("regex_tjg");
        string cwsl = first("regex_cwsl");
        string dxsmj = first("regex_dxsmj");
        string comName = first("regex_comName");
        string comArea = first("regex_comArea");

        // A company name containing "独立" is stored as empty.
        if (comName.Contains("独立"))
        {
            comName = "";
        }

        jznd = 转换建筑年代(jznd);
        caseTimeText = 转换案例时间(caseTimeText);

        NewHouse newHouse = new NewHouse(
            lpm, caseTimeText, xzq, pq,
            "", "", "",
            mj, dj, "", jg, "", zj,
            szlc, zlc, hx, cx, zx, jznd,
            title, phone, url, "", 网站名称, address,
            jzxs, hymj, tjg, cwsl, ptss, dxsmj,
            comName, comArea);

        newHouse.Alsj = newHouse.Alsj == null ? null : newHouse.Alsj.Trim();

        // A crawl started before noon accepts listings back to yesterday;
        // one started at noon or later accepts only today's listings.
        // An older listing flags the current page as the stopping point.
        // NOTE(review): unlike sibling crawlers there is no date-validity check here,
        // so an unparseable case time throws and the listing is not saved - confirm.
        int startHour = Convert.ToInt32(nowDate.ToString("HH"));
        DateTime caseTime = Convert.ToDateTime(newHouse.Alsj);
        DateTime startDay = Convert.ToDateTime(nowDate.ToString("yyyy-MM-dd"));
        DateTime oldestAccepted = startHour < 12 ? startDay.AddDays(-1) : startDay;
        if (caseTime < oldestAccepted)
        {
            isNowPageStop = true;
        }

        SaveNowData(newHouse);
        log.Debug(string.Format("{0}-数据保存完成url:{1}--cityname:{2}--value_title:{3}--value_lpm{4}", DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), url, CityName, title, lpm));
    }
    catch (Exception ex)
    {
        log.Error(string.Format("GetHouseByUrl()异常,url:{0}, cityName:{1}", url, CityName), ex);
    }
}
/// <summary>
/// Downloads a listing detail page (decoded with <c>NowPageEncoding</c>), extracts its
/// fields with the configured regex rules, builds a <c>NewHouse</c> and saves it.
/// Any exception is logged, not propagated.
/// </summary>
/// <param name="url">Detail page URL.</param>
public void GetHouseByUrl(string url)
{
    //url = "http://sz.esf.sina.com.cn/detail/7946016";
    try
    {
        // Extraction rule per field.
        Dictionary<string, RegexInfo> rules = new Dictionary<string, RegexInfo>
        {
            { "*regex_lpm", regex_lpm },
            { "*regex_xzq", regex_xzq },
            { "regex_pq", regex_pq },
            { "regex_hx", regex_hx },
            { "*regex_mj", regex_mj },
            { "*regex_dj", regex_dj },
            { "*regex_zj", regex_zj },
            { "regex_jznd", regex_jznd },
            { "regex_cx", regex_cx },
            { "regex_szlc", regex_szlc },
            { "regex_zlc", regex_zlc },
            { "regex_jg", regex_jg },
            { "regex_zx", regex_zx },
            { "regex_title", regex_title },
            { "regex_phone", regex_phone },
            { "regex_address", regex_address },
            { "regex_updatetime", regex_updatetime },
            { "regex_comName", regex_comName },
            { "regex_comArea", regex_comArea }
        };

        Dictionary<string, List<string>> matches = SpiderHelp.GetHtmlByRegex(url, NowPageEncoding, rules, WebObj, CityId);

        // First hit of a rule, or "" when the rule produced nothing.
        Func<string, string> first = key =>
        {
            List<string> hits = matches[key];
            return hits.Count < 1 ? "" : hits[0];
        };

        string title = first("regex_title");
        string lpm = first("*regex_lpm");
        string xzq = first("*regex_xzq");
        string pq = first("regex_pq");
        string hx = first("regex_hx");
        string mj = first("*regex_mj");
        string dj = first("*regex_dj");
        string zj = first("*regex_zj");
        string jznd = first("regex_jznd");
        string cx = first("regex_cx");
        string szlc = first("regex_szlc");
        string zlc = first("regex_zlc");
        string jg = first("regex_jg");
        string zx = first("regex_zx");
        string phone = first("regex_phone");
        string address = first("regex_address");
        string updateTime = first("regex_updatetime");
        string comName = first("regex_comName");
        string comArea = first("regex_comArea");

        // Build the entity; the page's update time is used as the case time.
        NewHouse newHouse = new NewHouse(
            lpm, updateTime, xzq, pq,
            "", "", "",
            mj, dj, "", jg, "", zj,
            szlc, zlc, hx, cx, zx, jznd,
            title, phone, url, "", 网站名称, address,
            "", "", "", "", "", "",
            comName, comArea);

        SaveNowData(newHouse);
        log.Debug(string.Format("{0}-数据保存完成url:{1}--cityname:{2}--value_title:{3}--value_lpm{4}", DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), url, CityName, title, lpm));
    }
    catch (Exception ex)
    {
        log.Error(string.Format("GetHouseByUrl()异常,url:{0}, cityName:{1}", url, CityName), ex);
    }
}
/// <summary>
/// Scrapes a listing detail page, looks up the poster's agency name on the
/// poster's profile page, and saves the result through <c>SaveNowData</c>.
/// </summary>
/// <remarks>
/// Every exception raised inside the method body is caught and logged; the
/// record is simply not saved in that case.
/// </remarks>
/// <param name="url">URL of the listing detail page.</param>
/// <param name="_infoText">Text of the list-page entry that links to this detail page; the publish time is extracted from it.</param>
public void GetHouseByUrl(string url, string _infoText)
{
    try
    {
        // Extraction rules applied to the detail page.
        // NOTE(review): keys prefixed with '*' are presumably treated specially
        // (e.g. as required fields) by SpiderHelp - confirm there.
        Dictionary<string, RegexInfo> dicRegexInfo = new Dictionary<string, RegexInfo>
        {
            { "*regex_lpm", regex_lpm },
            { "*regex_xzq", regex_xzq },
            { "regex_pq", regex_pq },
            { "regex_hx", regex_hx },
            { "*regex_mj", regex_mj },
            { "*regex_dj", regex_dj },
            { "*regex_zj", regex_zj },
            { "regex_jznd", regex_jznd },
            { "regex_cx", regex_cx },
            { "regex_szlc", regex_szlc },
            { "regex_zlc", regex_zlc },
            { "regex_jg", regex_jg },
            { "regex_zx", regex_zx },
            { "regex_yt", regex_yt },
            { "regex_jzxs", regex_jzxs },
            { "regex_ptss", regex_ptss },
            { "regex_title", regex_title },
            { "regex_phone", regex_phone },
            { "regex_address", regex_address },
            { "regex_hymj", regex_hymj },
            { "regex_tjg", regex_tjg },
            { "regex_cwsl", regex_cwsl },
            { "regex_dxsmj", regex_dxsmj },
            { "regex_userId", regex_userId }
        };

        // The publish time is not read from the detail page; it comes from the list-page text.
        Dictionary<string, RegexInfo> dicRegexInfo2 = new Dictionary<string, RegexInfo>
        {
            { "regex_datetime", regex_datetime }
        };

        // Run the rules against the detail page and against the list-page text.
        Dictionary<string, List<string>> dicRegexInfo_List = SpiderHelp.GetHtmlByRegex(url, "utf-8", dicRegexInfo, WebObj, CityId);
        Dictionary<string, List<string>> dicRegexInfo_List2 = SpiderHelp.GetStrByRegex(_infoText, dicRegexInfo2);

        // First match for a rule, or "" when the rule matched nothing.
        Func<Dictionary<string, List<string>>, string, string> firstMatch = (matches, key) =>
        {
            List<string> hits = matches[key];
            return hits.Count < 1 ? "" : hits[0];
        };

        string value_title = firstMatch(dicRegexInfo_List, "regex_title");
        string value_lpm = firstMatch(dicRegexInfo_List, "*regex_lpm");
        string value_xzq = firstMatch(dicRegexInfo_List, "*regex_xzq");
        string value_pq = firstMatch(dicRegexInfo_List, "regex_pq");
        string value_hx = firstMatch(dicRegexInfo_List, "regex_hx");
        string value_mj = firstMatch(dicRegexInfo_List, "*regex_mj");
        string value_dj = firstMatch(dicRegexInfo_List, "*regex_dj");
        string value_zj = firstMatch(dicRegexInfo_List, "*regex_zj");
        string value_jznd = firstMatch(dicRegexInfo_List, "regex_jznd");
        string value_cx = firstMatch(dicRegexInfo_List, "regex_cx");
        string value_szlc = firstMatch(dicRegexInfo_List, "regex_szlc");
        string value_zlc = firstMatch(dicRegexInfo_List, "regex_zlc");
        string value_jg = firstMatch(dicRegexInfo_List, "regex_jg");
        string value_zx = firstMatch(dicRegexInfo_List, "regex_zx");
        // regex_yt stays in the rule set above, but its result is not used here:
        // the purpose argument of the NewHouse constructor is passed as "".
        string value_jzxs = firstMatch(dicRegexInfo_List, "regex_jzxs");
        string value_ptss = firstMatch(dicRegexInfo_List, "regex_ptss");
        string value_phone = firstMatch(dicRegexInfo_List, "regex_phone");
        string value_address = firstMatch(dicRegexInfo_List, "regex_address");
        string value_hymj = firstMatch(dicRegexInfo_List, "regex_hymj");
        string value_tjg = firstMatch(dicRegexInfo_List, "regex_tjg");
        string value_cwsl = firstMatch(dicRegexInfo_List, "regex_cwsl");
        string value_dxsmj = firstMatch(dicRegexInfo_List, "regex_dxsmj");
        string value_userId = firstMatch(dicRegexInfo_List, "regex_userId");
        string value_datetime = firstMatch(dicRegexInfo_List2, "regex_datetime");

        // Agency name: fetched from the poster's profile page. Skip the request
        // entirely when no user id was extracted - there is no profile to look up.
        string value_comName = "";
        if (!string.IsNullOrEmpty(value_userId))
        {
            string userInfoUrl = "http://user.58.com/userdata?userid={0}&type=10";
            Dictionary<string, RegexInfo> dicRegexInfo3 = new Dictionary<string, RegexInfo>
            {
                { "regex_comName", regex_comName }
            };
            Dictionary<string, List<string>> dicRegexInfo_List3 = SpiderHelp.GetHtmlByRegex(string.Format(userInfoUrl, value_userId), "utf-8", dicRegexInfo3, WebObj, CityId);
            value_comName = firstMatch(dicRegexInfo_List3, "regex_comName");
        }

        // Only the case time is converted; the build year is passed through as extracted.
        value_datetime = 转换案例时间(value_datetime);

        // Assemble the record.
        NewHouse newHouse = new NewHouse(value_lpm, value_datetime, value_xzq, value_pq, "", "", "", value_mj, value_dj, "", value_jg, "", value_zj, value_szlc, value_zlc, value_hx, value_cx, value_zx, value_jznd, value_title, value_phone, url, "", 网站名称, value_address, value_jzxs, value_hymj, value_tjg, value_cwsl, value_ptss, value_dxsmj, value_comName, "");

        if (newHouse.Alsj != null)
        {
            newHouse.Alsj = newHouse.Alsj.Trim();
        }

        // Category pages are crawled by several threads, so paging is never stopped
        // from here. The publish-date cutoff is therefore not evaluated at all:
        // computing it would briefly expose isNowPageStop == true to other threads
        // and would abort the save on an unparsable case time for no benefit.
        isNowPageStop = false;

        // Persist the record.
        SaveNowData(newHouse);
        log.Debug(string.Format("{0}数据保存完成url:{1}--cityname:{2}--value_title:{3}--value_lpm{4}--案例时间:{5}--爬取时间:{6}", DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), url, CityName, value_title, value_lpm, newHouse.Alsj, nowDate));
    }
    catch (Exception ex)
    {
        log.Error(string.Format("GetHouseByUrl()异常,url:{0}, cityName:{1}", url, CityName), ex);
    }
}
/// <summary>
/// Scrapes a listing detail page, validates its purpose, normalises the floor
/// numbers and case time, and saves the result through <c>SaveNowData</c>.
/// </summary>
/// <remarks>
/// Listings whose purpose is rejected by <c>CheckPurpose</c> (result "0") are
/// logged and skipped. When the listing is older than the crawl cutoff,
/// <c>isNowPageStop</c> is set to <c>true</c>; the listing itself is still saved.
/// Every exception raised inside the method body is caught and logged.
/// </remarks>
/// <param name="url">URL of the listing detail page.</param>
public void GetHouseByUrl(string url)
{
    try
    {
        // Extraction rules applied to the detail page.
        // NOTE(review): keys prefixed with '*' are presumably treated specially
        // (e.g. as required fields) by SpiderHelp - confirm there.
        Dictionary<string, RegexInfo> dicRegexInfo = new Dictionary<string, RegexInfo>
        {
            { "regex_lpm", regex_lpm },
            { "*regex_xzq", regex_xzq },
            { "regex_pq", regex_pq },
            { "regex_hx", regex_hx },
            { "*regex_mj", regex_mj },
            { "*regex_dj", regex_dj },
            { "*regex_zj", regex_zj },
            { "regex_jznd", regex_jznd },
            { "regex_cx", regex_cx },
            { "regex_szlc", regex_szlc },
            { "regex_zlc", regex_zlc },
            { "regex_jg", regex_jg },
            { "regex_yt", regex_yt },
            { "regex_zx", regex_zx },
            { "regex_title", regex_title },
            { "regex_phone", regex_phone },
            { "*regex_address", regex_address },
            { "regex_datetime", regex_datetime },
            { "regex_comName", regex_comName },
            { "regex_comArea", regex_comArea }
        };

        // Run the rules against the detail page.
        Dictionary<string, List<string>> dicRegexInfo_List = SpiderHelp.GetHtmlByRegex(url, "utf-8", dicRegexInfo, WebObj, CityId);

        // First match for a rule, or "" when the rule matched nothing.
        Func<string, string> firstMatch = key =>
        {
            List<string> hits = dicRegexInfo_List[key];
            return hits.Count < 1 ? "" : hits[0];
        };

        string value_title = firstMatch("regex_title").Replace(" ", "");
        // This overload always stores the placeholder "无" as the estate name;
        // regex_lpm is still part of the rule set handed to SpiderHelp.
        string value_lpm = "无";
        string value_xzq = firstMatch("*regex_xzq");
        string value_pq = firstMatch("regex_pq");
        string value_hx = firstMatch("regex_hx");
        string value_mj = firstMatch("*regex_mj");
        string value_dj = firstMatch("*regex_dj");
        string value_zj = firstMatch("*regex_zj");
        string value_jznd = firstMatch("regex_jznd");
        string value_cx = firstMatch("regex_cx");
        string value_szlc = firstMatch("regex_szlc");
        string value_zlc = firstMatch("regex_zlc");
        string value_jg = firstMatch("regex_jg");
        string value_yt = firstMatch("regex_yt");
        string value_zx = firstMatch("regex_zx");
        string value_phone = firstMatch("regex_phone");
        string value_address = firstMatch("*regex_address");
        string value_datetime = firstMatch("regex_datetime");
        string value_comName = firstMatch("regex_comName");
        string value_comArea = firstMatch("regex_comArea");

        // If "current floor" is greater than "total floors" the two were scraped
        // in the wrong order: swap them (keeping the trimmed values).
        string trimmedSzlc = value_szlc.TrimBlank();
        string trimmedZlc = value_zlc.TrimBlank();
        if (StringHelp.IsInteger(trimmedZlc) && StringHelp.IsInteger(trimmedSzlc)
            && Convert.ToInt32(trimmedSzlc) > Convert.ToInt32(trimmedZlc))
        {
            value_szlc = trimmedZlc;
            value_zlc = trimmedSzlc;
        }

        // Validate the purpose. Keep the scraped text so the log shows what was
        // rejected instead of the "0" marker returned by CheckPurpose.
        string scrapedPurpose = value_yt;
        value_yt = CheckPurpose(scrapedPurpose);
        if (value_yt.Equals("0"))
        {
            log.Debug(string.Format("GetHouseByUrl()用途无效,url:{0}, cityName:{1},用途:{2}", url, CityName, scrapedPurpose));
            return;
        }

        // Assemble the record.
        NewHouse newHouse = new NewHouse(value_lpm, value_datetime, value_xzq, value_pq, "", "", value_yt, value_mj, value_dj, "", value_jg, "", value_zj, value_szlc, value_zlc, value_hx, value_cx, value_zx, value_jznd, value_title, value_phone, url, "", 网站名称, value_address, "", "", "", "", "", "", value_comName, value_comArea);

        if (newHouse.Alsj != null)
        {
            newHouse.Alsj = newHouse.Alsj.Trim();
        }

        if (!newHouse.Alsj.CheckStrIsDate())
        {
            // The scraped time is not directly parseable. Try to rebuild it: the
            // `time` rule yields a format template and the `time2` rule yields the
            // month text, which is mapped through `months` ("name,value" entries).
            // NOTE(review): the disabled experiments that used to sit at the top of
            // this method parsed "Tue Apr 29 15:17:39 GMT+08:00 2014", so that is
            // presumably the shape of the site's timestamps - confirm.
            List<string> timeList = SpiderHelp.GetStrByRegexByIndex(value_datetime, time);
            List<string> timeList2 = SpiderHelp.GetStrByRegexByIndex(value_datetime, time2);
            if (timeList != null && timeList.Count > 0 && timeList2 != null && timeList2.Count > 0)
            {
                string timeTemplate = timeList[0];
                string monthText = timeList2[0].ToLower();
                string monthEntry = months.FirstOrDefault(entry => monthText.Contains(entry.Split(',')[0]));
                if (!string.IsNullOrEmpty(monthEntry))
                {
                    newHouse.Alsj = string.Format(timeTemplate, monthEntry.Split(',')[1]);
                }
            }

            // Still not a date: fall back to "yesterday", written in the same
            // explicit format the rest of the file uses for timestamps.
            if (!newHouse.Alsj.CheckStrIsDate())
            {
                newHouse.Alsj = DateTime.Now.AddDays(-1).ToString("yyyy-MM-dd HH:mm:ss");
            }
        }

        // Cutoff for "too old": a crawl started before noon also accepts
        // yesterday's listings, a crawl started at/after noon only today's.
        DateTime cutoff = nowDate.Date;
        if (nowDate.Hour < 12)
        {
            cutoff = cutoff.AddDays(-1);
        }

        if (Convert.ToDateTime(newHouse.Alsj) < cutoff)
        {
            isNowPageStop = true;
        }

        // Persist the record.
        SaveNowData(newHouse);
        log.Debug(string.Format("{0}-数据保存完成url:{1}--cityname:{2}--value_title:{3}--value_lpm{4}", DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), url, CityName, value_title, value_lpm));
    }
    catch (Exception ex)
    {
        log.Error(string.Format("GetHouseByUrl()异常,url:{0}, cityName:{1}", url, CityName), ex);
    }
}