Пример #1
0
        /// <summary>
        /// Normalizes the text columns of a <see cref="NewHouse"/> record in place and fills in
        /// the derived columns (building type, purpose, house type, case time).
        /// </summary>
        /// <param name="newHouse">Record to normalize; it is mutated and returned.</param>
        /// <returns>The same <paramref name="newHouse"/> instance after normalization.</returns>
        public static NewHouse ToColumnStr(this NewHouse newHouse)
        {
            // Clean up the raw text columns.
            newHouse.Lpm   = StringHelp.TrimBlank(newHouse.Lpm).ToRemoveSpe();
            // Xzq is trimmed with an instance call, so a null district is treated as empty
            // instead of raising NullReferenceException.
            newHouse.Xzq   = StringHelp.TrimBlank((newHouse.Xzq ?? string.Empty).Trim().ToRemoveSpe());
            // An empty structure value falls back to the default "平面".
            string structure = StringHelp.TrimBlank(newHouse.Jg);
            newHouse.Jg    = string.IsNullOrEmpty(structure) ? "平面" : structure;
            newHouse.Zj    = StringHelp.TrimBlank(newHouse.Zj);
            newHouse.Cx    = StringHelp.TrimBlank(newHouse.Cx).ToRemoveSpe();
            newHouse.Phone = StringHelp.TrimBlank(newHouse.Phone).ToRemoveSpe();

            // Numeric columns keep only the part before the first '.'.
            newHouse.Mj    = StripFractionalPart(newHouse.Mj);
            newHouse.Dj    = StripFractionalPart(newHouse.Dj);
            newHouse.Hymj  = StripFractionalPart(newHouse.Hymj);
            newHouse.Dxsmj = StripFractionalPart(newHouse.Dxsmj);

            // Derived columns.
            newHouse.Jzlx = SpiderHelp.GetBuildingType(newHouse.Zlc);                // building type from total floors
            newHouse.Yt   = SpiderHelp.GetHousePurposes(newHouse.Mj, newHouse.Jzlx); // purpose from area + building type
            newHouse.Hx   = SpiderHelp.GetHouseType(newHouse.Hx).ToRemoveSpe();
            newHouse.Alsj = newHouse.Alsj != null ? newHouse.Alsj.Trim() : newHouse.Alsj;

            // Case time: reformat a recognised date, otherwise stamp the record with the current time.
            if (!StringHelp.CheckStrIsDate(newHouse.Alsj))
            {
                newHouse.Alsj = DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss");
            }
            else
            {
                newHouse.Alsj = Convert.ToDateTime(newHouse.Alsj).ToString("yyyy-MM-dd HH:mm:ss");
            }
            return(newHouse);
        }

        /// <summary>
        /// Removes the first '.' and everything after it up to the end of that line.
        /// </summary>
        /// <param name="value">Raw numeric text; may be null.</param>
        /// <returns>The truncated text, or an empty string when <paramref name="value"/> is null or empty.</returns>
        private static string StripFractionalPart(string value)
        {
            // Regex.Replace throws ArgumentNullException on null input, so guard first.
            if (string.IsNullOrEmpty(value))
            {
                return string.Empty;
            }
            return Regex.Replace(value, @"\..*", "");
        }
Пример #2
0
        /// <summary>
        /// Stores one crawled listing through <c>CaseManager</c>. A debug line is logged before
        /// the insert; an exception raised by the insert is logged and not rethrown.
        /// </summary>
        /// <param name="newHouse">Listing to store; a null value is ignored.</param>
        public virtual void SaveNowData(NewHouse newHouse)
        {
            if (newHouse != null)
            {
                string startedAt = DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss");
                string savingMessage = string.Format("{0}-数据保存中:网站:{1}--城市:{2}-(url:{3}--)", startedAt, 网站名称, CityName, newHouse.Url);
                log.Debug(savingMessage);

                try
                {
                    // Named arguments are kept in their original order.
                    CaseManager.往案例表插入爬取数据(
                        网站名称: 网站名称, 城市名称: CityName, 网站ID: 网站ID, 城市ID: CityId,
                        楼盘名: newHouse.Lpm, 案例时间: newHouse.Alsj, 行政区: newHouse.Xzq, 片区: newHouse.Pq,
                        楼栋: newHouse.Ld, 房号: newHouse.Fh, 用途: newHouse.Yt, 面积: newHouse.Mj,
                        单价: newHouse.Dj, 案例类型: newHouse.Allx, 结构: newHouse.Jg, 建筑类型: newHouse.Jzlx,
                        总价: newHouse.Zj, 所在楼层: newHouse.Szlc, 总楼层: newHouse.Zlc, 户型: newHouse.Hx,
                        朝向: newHouse.Cx, 装修: newHouse.Zx, 建筑年代: newHouse.Jznd, 信息: newHouse.Title,
                        电话: newHouse.Phone, URL: newHouse.Url, 币种: newHouse.Bz, 地址: newHouse.Addres,
                        创建时间: DateTime.Now,
                        车位数量: newHouse.Cwsl, 地下室面积: newHouse.Dxsmj, 花园面积: newHouse.Hymj,
                        建筑形式: newHouse.Jzxs, 配套设施: newHouse.Ptss, 厅结构: newHouse.Tjg,
                        中介公司: newHouse.ComName, 门店: newHouse.ComArea,
                        startSpiderDate: nowDate);
                }
                catch (Exception saveError)
                {
                    string failureMessage = string.Format("数据保存中异常:网站:{0}--城市:{1}-(url:{2}--)", 网站名称, CityName, newHouse.Url);
                    log.Error(failureMessage, saveError);
                }
            }
        }
Пример #3
0
 /// <summary>
 /// Runs the area, unit-price and floor checks on a listing, stopping at the first failure.
 /// </summary>
 /// <param name="newHouse">Listing whose Mj, Dj, Zlc and Szlc values are checked.</param>
 /// <param name="cityName">City name, used only in the failure message.</param>
 /// <param name="message">Empty when every check passes; otherwise a description of the first failed check.</param>
 /// <returns><c>true</c> when every check passes; otherwise <c>false</c>.</returns>
 public static bool CheckHouseAll(NewHouse newHouse, string cityName, out string message)
 {
     message = "";
     if (!SpiderHelp.CheckHouseArea(newHouse.Mj))
     {
         message = string.Format("面积不合格,下一个:{0}-(value_mj:{1},url:{2}--)", cityName, newHouse.Mj, newHouse.Url);
         return(false);
     }
     if (!SpiderHelp.CheckHouseUnitPrice(newHouse.Dj))
     {
         message = string.Format("单价不合格,下一个:{0}-(value_dj:{1},url:{2}--)", cityName, newHouse.Dj, newHouse.Url);
         return(false);
     }
     if (!SpiderHelp.CheckHouseFloor(newHouse.Zlc, newHouse.Szlc))
     {
         // Each placeholder has its own index: {1} = total floors, {2} = current floor, {3} = URL.
         message = string.Format("所在楼层或总楼层不合格,下一个:{0}-(value_zlc:{1},value_szlc:{2},url:{3}--)", cityName, newHouse.Zlc, newHouse.Szlc, newHouse.Url);
         return(false);
     }
     return(true);
 }
Пример #4
0
        //static string key_jzlb = "建筑类别";
        //static string key_cqxz = "产权性质";
        //static string key_ptss = "配套设施";
        #endregion

        /// <summary>
        /// Appends one listing as a new row to the per-day Excel workbook for its source site and
        /// city. When the workbook does not exist yet it is created with a header row first.
        /// Exceptions raised while writing are logged and not rethrown.
        /// </summary>
        /// <param name="cityName">City name; used in the log message and in the workbook file name.</param>
        /// <param name="newHouse">Listing to write; when null the method returns without doing anything.</param>
        public static void SaveExcel(string cityName, NewHouse newHouse)
        {
            // Must run before the first dereference of newHouse below.
            if (newHouse == null)
            {
                return;
            }

            log.Debug(string.Format("Excel保存中:{0}-(url:{1}--)", cityName, newHouse.Url));

            string nowDate  = DateTime.Now.ToString("yyyy-MM-dd");
            string fileName = string.Format("{0}_{1}_{2}.xls", nowDate, newHouse.Wzly, cityName);
            string path     = SpiderHelp.GetConfigDire() + "DataSource\\" + fileName;

            string directory = Path.GetDirectoryName(path);
            if (!Directory.Exists(directory))
            {
                Directory.CreateDirectory(directory);
            }

            Excel.Application app     = new Excel.Application();
            Excel.Workbook    book    = null;
            object            missing = System.Reflection.Missing.Value;

            try
            {
                int             nowRow;
                bool            existsFile = File.Exists(path);
                Excel.Worksheet sheet;
                if (existsFile)
                {
                    // Existing workbook: append after the last used row of the first sheet.
                    app.Workbooks.Open(path, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing);
                    book   = (Excel.Workbook)app.ActiveWorkbook;
                    sheet  = (Excel.Worksheet)book.Sheets[1];
                    nowRow = sheet.UsedRange.Cells.Rows.Count;
                }
                else
                {
                    // New workbook: write the header row into row 1.
                    app.Application.Workbooks.Add(true);
                    book  = (Excel.Workbook)app.ActiveWorkbook;
                    sheet = (Excel.Worksheet)book.ActiveSheet;

                    object[] headers =
                    {
                        SaveData.key_lpm, SaveData.key_alsj, SaveData.key_xzq, SaveData.key_pq, SaveData.key_ld,
                        SaveData.key_fh, SaveData.key_yt, SaveData.key_mj, SaveData.key_dj, SaveData.key_allx,
                        SaveData.key_jg, SaveData.key_jzlx, SaveData.key_zj, SaveData.key_szlc, SaveData.key_zlc,
                        SaveData.key_hx, SaveData.key_cx, SaveData.key_zx, SaveData.key_jznd, SaveData.key_title,
                        SaveData.key_phone, SaveData.key_url, SaveData.key_bz, SaveData.key_wzly, SaveData.key_address
                    };
                    for (int col = 0; col < headers.Length; col++)
                    {
                        sheet.Cells[1, col + 1] = headers[col];
                    }
                    nowRow = 1;
                }

                // Data columns, in the same order as the header row.
                string[] values =
                {
                    newHouse.Lpm, newHouse.Alsj, newHouse.Xzq, newHouse.Pq, newHouse.Ld,
                    newHouse.Fh, newHouse.Yt, newHouse.Mj, newHouse.Dj, newHouse.Allx,
                    newHouse.Jg, newHouse.Jzlx, newHouse.Zj, newHouse.Szlc, newHouse.Zlc,
                    newHouse.Hx, newHouse.Cx, newHouse.Zx, newHouse.Jznd, newHouse.Title,
                    newHouse.Phone, newHouse.Url, newHouse.Bz, newHouse.Wzly, newHouse.Addres
                };
                int targetRow = nowRow + 1;
                for (int col = 0; col < values.Length; col++)
                {
                    // Each column is null-checked on its own value; null is written as an empty cell.
                    sheet.Cells[targetRow, col + 1] = ExcelReplaceStr(values[col] ?? "");
                }

                // Persist: save in place for an existing file, otherwise write the new file.
                if (existsFile)
                {
                    book.Save();
                }
                else
                {
                    book.SaveCopyAs(path);
                }
            }
            catch (Exception ex)
            {
                log.Error(string.Format("title:{0}--lpm:{1}--excel导入异常", newHouse.Title, newHouse.Lpm), ex);
                // NOTE(review): this pause preceded a retry call that was already commented out;
                // the pause is kept so failure timing is unchanged.
                System.Threading.Thread.Sleep(2000);
            }
            finally
            {
                // Always close the workbook and quit Excel, even if closing the workbook throws.
                try
                {
                    if (book != null)
                    {
                        book.Close(false, missing, missing);
                    }
                }
                finally
                {
                    app.Quit();
                }
            }
        }
Пример #5
0
        /// <summary>
        /// Downloads a listing detail page, extracts its fields with the configured regular
        /// expressions and saves the resulting record through <c>SaveNowData</c>.
        /// Any exception is logged and not rethrown.
        /// </summary>
        /// <param name="url">Detail page URL.</param>
        /// <param name="urlPanelHtml">HTML text that the update-time regex is applied to.</param>
        public void GetHouseByUrl(string url, string urlPanelHtml)
        {
            try
            {
                // Extraction rules keyed by rule name.
                // NOTE(review): the '*' prefix is interpreted by SpiderHelp.GetHtmlByRegex — presumably it marks required fields; confirm.
                Dictionary <string, RegexInfo> rules = new Dictionary <string, RegexInfo>
                {
                    { "*regex_lpm", regex_lpm },
                    { "*regex_xzq", regex_xzq },
                    { "regex_pq", regex_pq },
                    { "regex_hx", regex_hx },
                    { "*regex_mj", regex_mj },
                    { "*regex_dj", regex_dj },
                    { "*regex_zj", regex_zj },
                    { "regex_jznd", regex_jznd },
                    { "regex_cx", regex_cx },
                    { "regex_szlc", regex_szlc },
                    { "regex_zlc", regex_zlc },
                    { "regex_jg", regex_jg },
                    { "regex_zx", regex_zx },
                    { "regex_title", regex_title },
                    { "regex_phone", regex_phone },
                    { "regex_address", regex_address },
                    { "regex_datetime", regex_datetime },
                    { "regex_comName", regex_comName },
                    { "regex_comArea", regex_comArea },
                    { "regex_yt", regex_yt }
                };

                // Fetch the page and run every rule against it.
                Dictionary <string, List <string> > matches = SpiderHelp.GetHtmlByRegex(url, "gb2312", rules, WebObj, CityId, timeout: 30000);
                List <string> updateTimes = SpiderHelp.GetStrByRegexByIndex(urlPanelHtml, regex_updatetime);

                // First captured value for a rule, or "" when the rule matched nothing.
                Func <string, string> firstMatch = key =>
                {
                    List <string> hits = matches[key];
                    return hits.Count < 1 ? "" : hits[0];
                };

                string value_title      = firstMatch("regex_title");
                string value_lpm        = firstMatch("*regex_lpm");
                string value_xzq        = firstMatch("*regex_xzq");
                string value_pq         = firstMatch("regex_pq");
                string value_hx         = firstMatch("regex_hx");
                string value_mj         = firstMatch("*regex_mj");
                string value_dj         = firstMatch("*regex_dj");
                string value_zj         = firstMatch("*regex_zj");
                string value_jznd       = firstMatch("regex_jznd");
                string value_cx         = firstMatch("regex_cx");
                string value_szlc       = firstMatch("regex_szlc");
                string value_zlc        = firstMatch("regex_zlc");
                string value_jg         = firstMatch("regex_jg");
                string value_yt         = firstMatch("regex_yt");
                string value_zx         = firstMatch("regex_zx");
                string value_phone      = firstMatch("regex_phone");
                string value_address    = firstMatch("regex_address");
                string value_datetime   = firstMatch("regex_datetime");
                string value_comName    = firstMatch("regex_comName");
                string value_comArea    = firstMatch("regex_comArea");
                string value_updatetime = updateTimes.Count < 1 ? "" : updateTimes[0];

                // CheckPurpose returning "0" means the listing is skipped. The normalized purpose
                // is used only as this gate; it is not passed to the NewHouse constructor below.
                string checkedPurpose = CheckPurpose(value_yt);
                if (checkedPurpose == "0")
                {
                    log.Debug(string.Format("GetHouseByUrl()用途无效,url:{0}, cityName:{1},用途:{2}", url, CityName, value_yt));
                    return;
                }

                value_cx = value_cx.Replace("朝", "").TrimBlank();

                // Build the record (positional constructor arguments).
                string   caseDate = GetCaseDate(value_datetime, value_updatetime);
                NewHouse newHouse = new NewHouse(
                    value_lpm, caseDate, value_xzq, value_pq, "", "", "", value_mj, value_dj,
                    "", value_jg, "", value_zj, value_szlc, value_zlc, value_hx, value_cx, value_zx, value_jznd,
                    value_title, value_phone, url, "", 网站名称, value_address, "", "", "", "", "", "", value_comName, value_comArea);

                // Normalize the case time; fall back to "now" when it is not a valid date.
                if (newHouse.Alsj != null)
                {
                    newHouse.Alsj = newHouse.Alsj.Trim();
                }
                if (!newHouse.Alsj.CheckStrIsDate())
                {
                    newHouse.Alsj = DateTime.Now.ToString();
                }

                // A crawl started before noon accepts listings back to yesterday; otherwise only today's.
                int      startHour = Convert.ToInt32(nowDate.ToString("HH"));
                DateTime caseTime  = Convert.ToDateTime(newHouse.Alsj);
                DateTime cutoff    = Convert.ToDateTime(nowDate.ToString("yyyy-MM-dd"));
                if (startHour < 12)
                {
                    cutoff = cutoff.AddDays(-1);
                }
                if (caseTime < cutoff)
                {
                    isNowPageStop = true;
                }
                // Pages are crawled on multiple threads, so for now the stop flag is always cleared.
                isNowPageStop = false;

                SaveNowData(newHouse);
                log.Debug(string.Format("{0}-数据保存完成url:{1}--cityname:{2}--value_title:{3}--value_lpm{4}", DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), url, CityName, value_title, value_lpm));
            }
            catch (Exception ex)
            {
                log.Error(string.Format("GetHouseByUrl()异常,url:{0}, cityName:{1}", url, CityName), ex);
            }
        }
Пример #6
0
        /// <summary>
        /// Downloads a listing detail page, extracts and cleans its fields with the configured
        /// regular expressions and saves the resulting record through <c>SaveNowData</c>.
        /// Any exception is logged and not rethrown.
        /// </summary>
        /// <param name="url">Detail page URL.</param>
        public void GetHouseByUrl(string url)
        {
            try
            {
                // Extraction rules keyed by rule name.
                // NOTE(review): the '*' prefix is interpreted by SpiderHelp.GetHtmlByRegex — presumably it marks required fields; confirm.
                Dictionary <string, RegexInfo> rules = new Dictionary <string, RegexInfo>
                {
                    { "*regex_lpm", regex_lpm },
                    { "*regex_xzq", regex_xzq },
                    { "regex_pq", regex_pq },
                    { "regex_hx", regex_hx },
                    { "*regex_mj", regex_mj },
                    { "*regex_dj", regex_dj },
                    { "*regex_zj", regex_zj },
                    { "regex_jznd", regex_jznd },
                    { "regex_cx", regex_cx },
                    { "regex_szlc", regex_szlc },
                    { "regex_zlc", regex_zlc },
                    { "regex_jg", regex_jg },
                    { "regex_yt", regex_yt },
                    { "regex_zx", regex_zx },
                    { "regex_title", regex_title },
                    { "regex_phone", regex_phone },
                    { "*regex_address", regex_address },
                    { "regex_datetime", regex_datetime },
                    { "regex_comName", regex_comName },
                    { "regex_comArea", regex_comArea }
                };

                // Fetch the page and run every rule against it.
                Dictionary <string, List <string> > matches = SpiderHelp.GetHtmlByRegex(url, "utf-8", rules, WebObj, CityId, keepAlive: true);

                // Returns "" when a rule matched nothing; otherwise the first capture passed
                // through the given clean-up step (the step is not applied to the "" fallback).
                Func <string, Func <string, string>, string> pick = (key, cleanUp) =>
                {
                    List <string> hits = matches[key];
                    return hits.Count < 1 ? "" : cleanUp(hits[0]);
                };
                Func <string, string> asIs         = text => text;
                Func <string, string> noBlanks     = text => text.TrimBlank();
                Func <string, string> noBlanksNbsp = text => text.TrimBlank().Replace("&nbsp;", "");

                string value_title    = pick("regex_title", text => text.Replace("&nbsp;", ""));
                string value_lpm      = pick("*regex_lpm", noBlanks);
                string value_xzq      = pick("*regex_xzq", noBlanks);
                string value_pq       = pick("regex_pq", noBlanks);
                string value_hx       = pick("regex_hx", noBlanks);
                string value_mj       = pick("*regex_mj", noBlanks);
                string value_dj       = pick("*regex_dj", noBlanks);
                string value_zj       = pick("*regex_zj", noBlanks);
                string value_jznd     = pick("regex_jznd", noBlanksNbsp);
                string value_cx       = pick("regex_cx", noBlanksNbsp);
                string value_szlc     = pick("regex_szlc", noBlanks);
                string value_zlc      = pick("regex_zlc", noBlanks);
                string value_jg       = pick("regex_jg", noBlanks);
                string value_yt       = pick("regex_yt", noBlanks);
                string value_zx       = pick("regex_zx", noBlanksNbsp);
                string value_phone    = pick("regex_phone", noBlanks);
                string value_address  = pick("*regex_address", noBlanks);
                string value_datetime = pick("regex_datetime", text => text.Trim());
                string value_comName  = pick("regex_comName", asIs);
                string value_comArea  = pick("regex_comArea", asIs);

                // Decoration / orientation containing "无" are stored as empty.
                if (value_zx.Contains("无"))
                {
                    value_zx = "";
                }
                if (value_cx.Contains("无"))
                {
                    value_cx = "";
                }
                value_dj = value_dj.Replace(",", "");

                // When both floor values are integers and the current floor exceeds the total,
                // the two values are swapped.
                string totalFloors = value_zlc.TrimBlank();
                if (StringHelp.IsInteger(totalFloors))
                {
                    string currentFloor = value_szlc.TrimBlank();
                    if (StringHelp.IsInteger(currentFloor) && Convert.ToInt32(currentFloor) > Convert.ToInt32(totalFloors))
                    {
                        value_szlc = totalFloors;
                        value_zlc  = currentFloor;
                    }
                }

                // Build the record (positional constructor arguments).
                string   caseDate = GetCaseDate(value_datetime);
                NewHouse newHouse = new NewHouse(
                    value_lpm, caseDate, value_xzq, value_pq, "", "", value_yt, value_mj, value_dj,
                    "", value_jg, "", value_zj, value_szlc, value_zlc, value_hx, value_cx, value_zx, value_jznd,
                    value_title, value_phone, url, "", 网站名称, value_address, "", "", "", "", "", "", value_comName, value_comArea);

                // Normalize the case time; fall back to one day ago when it is not a valid date.
                if (newHouse.Alsj != null)
                {
                    newHouse.Alsj = newHouse.Alsj.Trim();
                }
                if (!newHouse.Alsj.CheckStrIsDate())
                {
                    newHouse.Alsj = DateTime.Now.AddDays(-1).ToString();
                }

                // A crawl started before noon accepts listings back to yesterday; otherwise only today's.
                int      startHour = Convert.ToInt32(nowDate.ToString("HH"));
                DateTime caseTime  = Convert.ToDateTime(newHouse.Alsj);
                DateTime cutoff    = Convert.ToDateTime(nowDate.ToString("yyyy-MM-dd"));
                if (startHour < 12)
                {
                    cutoff = cutoff.AddDays(-1);
                }
                if (caseTime < cutoff)
                {
                    isNowPageStop = true;
                }
                // The stop flag is always cleared again before saving.
                isNowPageStop = false;

                SaveNowData(newHouse);
                log.Debug(string.Format("{0}-数据保存完成url:{1}--cityname:{2}--value_title:{3}--value_lpm{4}", DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), url, CityName, value_title, value_lpm));
            }
            catch (Exception ex)
            {
                log.Error(string.Format("GetHouseByUrl()异常,url:{0}, cityName:{1}", url, CityName), ex);
            }
        }
Пример #7
0
        /// <summary>
        /// 根据详细页url获取信息
        /// </summary>
        /// <param name="url">详细页url</param>
        public void GetHouseByUrl(string url)
        {
            try
            {
                //设置各字段规则(正则)
                Dictionary <string, RegexInfo> dicRegexInfo = new Dictionary <string, RegexInfo>();
                dicRegexInfo.Add("*regex_lpm", regex_lpm);
                dicRegexInfo.Add("*regex_xzq", regex_xzq);
                dicRegexInfo.Add("regex_pq", regex_pq);
                dicRegexInfo.Add("regex_hx", regex_hx);
                dicRegexInfo.Add("*regex_mj", regex_mj);
                dicRegexInfo.Add("*regex_dj", regex_dj);
                dicRegexInfo.Add("*regex_zj", regex_zj);
                dicRegexInfo.Add("regex_jznd", regex_jznd);
                dicRegexInfo.Add("regex_cx", regex_cx);
                dicRegexInfo.Add("regex_szlc", regex_szlc);
                dicRegexInfo.Add("regex_zlc", regex_zlc);
                dicRegexInfo.Add("regex_jg", regex_jg);
                dicRegexInfo.Add("regex_zx", regex_zx);
                dicRegexInfo.Add("regex_yt", regex_yt);
                dicRegexInfo.Add("regex_jzxs", regex_jzxs);
                dicRegexInfo.Add("regex_ptss", regex_ptss);
                dicRegexInfo.Add("regex_title", regex_title);
                dicRegexInfo.Add("regex_phone", regex_phone);
                dicRegexInfo.Add("regex_address", regex_address);
                dicRegexInfo.Add("regex_datetime", regex_datetime);
                dicRegexInfo.Add("regex_hymj", regex_hymj);
                dicRegexInfo.Add("regex_tjg", regex_tjg);
                dicRegexInfo.Add("regex_cwsl", regex_cwsl);
                dicRegexInfo.Add("regex_dxsmj", regex_dxsmj);
                dicRegexInfo.Add("regex_comName", regex_comName);
                dicRegexInfo.Add("regex_comArea", regex_comArea);
                Dictionary <string, List <string> > dicRegexInfo_List = new Dictionary <string, List <string> >();

                //根据规则获取数据
                dicRegexInfo_List = SpiderHelp.GetHtmlByRegex(url, "utf-8", dicRegexInfo, WebObj, CityId);
                string value_title    = dicRegexInfo_List["regex_title"].Count < 1 ? "" : dicRegexInfo_List["regex_title"][0];
                string value_lpm      = dicRegexInfo_List["*regex_lpm"].Count < 1 ? "" : dicRegexInfo_List["*regex_lpm"][0];
                string value_xzq      = dicRegexInfo_List["*regex_xzq"].Count < 1 ? "" : dicRegexInfo_List["*regex_xzq"][0];
                string value_pq       = dicRegexInfo_List["regex_pq"].Count < 1 ? "" : dicRegexInfo_List["regex_pq"][0];
                string value_hx       = dicRegexInfo_List["regex_hx"].Count < 1 ? "" : dicRegexInfo_List["regex_hx"][0];
                string value_mj       = dicRegexInfo_List["*regex_mj"].Count < 1 ? "" : dicRegexInfo_List["*regex_mj"][0];
                string value_dj       = dicRegexInfo_List["*regex_dj"].Count < 1 ? "" : dicRegexInfo_List["*regex_dj"][0];
                string value_zj       = dicRegexInfo_List["*regex_zj"].Count < 1 ? "" : dicRegexInfo_List["*regex_zj"][0];
                string value_jznd     = dicRegexInfo_List["regex_jznd"].Count < 1 ? "" : dicRegexInfo_List["regex_jznd"][0];
                string value_cx       = dicRegexInfo_List["regex_cx"].Count < 1 ? "" : dicRegexInfo_List["regex_cx"][0];
                string value_szlc     = dicRegexInfo_List["regex_szlc"].Count < 1 ? "" : dicRegexInfo_List["regex_szlc"][0];
                string value_zlc      = dicRegexInfo_List["regex_zlc"].Count < 1 ? "" : dicRegexInfo_List["regex_zlc"][0];
                string value_jg       = dicRegexInfo_List["regex_jg"].Count < 1 ? "" : dicRegexInfo_List["regex_jg"][0];
                string value_zx       = dicRegexInfo_List["regex_zx"].Count < 1 ? "" : dicRegexInfo_List["regex_zx"][0];
                string value_yt       = dicRegexInfo_List["regex_yt"].Count < 1 ? "" : dicRegexInfo_List["regex_yt"][0];
                string value_jzxs     = dicRegexInfo_List["regex_jzxs"].Count < 1 ? "" : dicRegexInfo_List["regex_jzxs"][0];
                string value_ptss     = dicRegexInfo_List["regex_ptss"].Count < 1 ? "" : dicRegexInfo_List["regex_ptss"][0];
                string value_phone    = dicRegexInfo_List["regex_phone"].Count < 1 ? "" : dicRegexInfo_List["regex_phone"][0];
                string value_address  = dicRegexInfo_List["regex_address"].Count < 1 ? "" : dicRegexInfo_List["regex_address"][0];
                string value_datetime = dicRegexInfo_List["regex_datetime"].Count < 1 ? "" : dicRegexInfo_List["regex_datetime"][0];
                string value_hymj     = dicRegexInfo_List["regex_hymj"].Count < 1 ? "" : dicRegexInfo_List["regex_hymj"][0];
                string value_tjg      = dicRegexInfo_List["regex_tjg"].Count < 1 ? "" : dicRegexInfo_List["regex_tjg"][0];
                string value_cwsl     = dicRegexInfo_List["regex_cwsl"].Count < 1 ? "" : dicRegexInfo_List["regex_cwsl"][0];
                string value_dxsmj    = dicRegexInfo_List["regex_dxsmj"].Count < 1 ? "" : dicRegexInfo_List["regex_dxsmj"][0];
                string value_comName  = dicRegexInfo_List["regex_comName"].Count < 1 ? "" : dicRegexInfo_List["regex_comName"][0];
                string value_comArea  = dicRegexInfo_List["regex_comArea"].Count < 1 ? "" : dicRegexInfo_List["regex_comArea"][0];
                value_comName  = value_comName.Contains("独立") ? "" : value_comName;
                value_jznd     = 转换建筑年代(value_jznd);
                value_datetime = 转换案例时间(value_datetime);
                //将数据添加到字典
                NewHouse newHouse = new NewHouse(value_lpm, value_datetime, value_xzq, value_pq, "", "", "", value_mj, value_dj,
                                                 "", value_jg, "", value_zj, value_szlc, value_zlc, value_hx, value_cx, value_zx, value_jznd,
                                                 value_title, value_phone, url, "", 网站名称, value_address, value_jzxs, value_hymj, value_tjg, value_cwsl, value_ptss, value_dxsmj, value_comName, value_comArea);

                //当前数据为一天前的数据时
                newHouse.Alsj = newHouse.Alsj != null?newHouse.Alsj.Trim() : newHouse.Alsj;

                //获取刚开始爬取时的小时单位
                int nowH = Convert.ToInt32(nowDate.ToString("HH"));
                if (nowH < 12)//如果是在12点之前开始 则析取昨天的数据
                {
                    if (Convert.ToDateTime(newHouse.Alsj) < Convert.ToDateTime(nowDate.ToString("yyyy-MM-dd")).AddDays(-1))
                    {
                        isNowPageStop = true;
                    }
                }
                else //如果是在12点之后开始 则析取当天的数据
                {
                    if (Convert.ToDateTime(newHouse.Alsj) < Convert.ToDateTime(nowDate.ToString("yyyy-MM-dd")))
                    {
                        isNowPageStop = true;
                    }
                }
                //保存数据
                SaveNowData(newHouse);
                log.Debug(string.Format("{0}-数据保存完成url:{1}--cityname:{2}--value_title:{3}--value_lpm{4}", DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), url, CityName, value_title, value_lpm));
            }
            catch (Exception ex)
            {
                log.Error(string.Format("GetHouseByUrl()异常,url:{0}, cityName:{1}", url, CityName), ex);
            }
        }
Пример #8
0
        /// <summary>
        /// Scrapes one listing detail page: runs the configured field rules against the
        /// page at <paramref name="url"/>, builds a <c>NewHouse</c> from the first value
        /// returned for each rule and hands it to <c>SaveNowData</c>.
        /// </summary>
        /// <remarks>
        /// Every exception thrown inside the method is caught and written to the error
        /// log, so nothing propagates to the caller.
        /// </remarks>
        /// <param name="url">URL of the detail page.</param>
        public void GetHouseByUrl(string url)
        {
            // Sample detail page kept from the original author's debugging:
            // http://sz.esf.sina.com.cn/detail/7946016
            try
            {
                // Field rules (regular expressions). The keys are reused below to read
                // the extracted values back out of the result dictionary.
                Dictionary<string, RegexInfo> fieldRules = new Dictionary<string, RegexInfo>
                {
                    { "*regex_lpm", regex_lpm },
                    { "*regex_xzq", regex_xzq },
                    { "regex_pq", regex_pq },
                    { "regex_hx", regex_hx },
                    { "*regex_mj", regex_mj },
                    { "*regex_dj", regex_dj },
                    { "*regex_zj", regex_zj },
                    { "regex_jznd", regex_jznd },
                    { "regex_cx", regex_cx },
                    { "regex_szlc", regex_szlc },
                    { "regex_zlc", regex_zlc },
                    { "regex_jg", regex_jg },
                    { "regex_zx", regex_zx },
                    { "regex_title", regex_title },
                    { "regex_phone", regex_phone },
                    { "regex_address", regex_address },
                    { "regex_updatetime", regex_updatetime },
                    { "regex_comName", regex_comName },
                    { "regex_comArea", regex_comArea }
                };

                // Fetch the page and apply every rule to it.
                Dictionary<string, List<string>> matches =
                    SpiderHelp.GetHtmlByRegex(url, NowPageEncoding, fieldRules, WebObj, CityId);

                // First value returned for a rule, or "" when the rule produced nothing.
                Func<string, string> firstOrEmpty = key =>
                {
                    List<string> hits = matches[key];
                    return hits.Count < 1 ? "" : hits[0];
                };

                string title      = firstOrEmpty("regex_title");
                string lpm        = firstOrEmpty("*regex_lpm");
                string xzq        = firstOrEmpty("*regex_xzq");
                string pq         = firstOrEmpty("regex_pq");
                string hx         = firstOrEmpty("regex_hx");
                string mj         = firstOrEmpty("*regex_mj");
                string dj         = firstOrEmpty("*regex_dj");
                string zj         = firstOrEmpty("*regex_zj");
                string jznd       = firstOrEmpty("regex_jznd");
                string cx         = firstOrEmpty("regex_cx");
                string szlc       = firstOrEmpty("regex_szlc");
                string zlc        = firstOrEmpty("regex_zlc");
                string jg         = firstOrEmpty("regex_jg");
                string zx         = firstOrEmpty("regex_zx");
                string phone      = firstOrEmpty("regex_phone");
                string address    = firstOrEmpty("regex_address");
                string updatetime = firstOrEmpty("regex_updatetime");
                string comName    = firstOrEmpty("regex_comName");
                string comArea    = firstOrEmpty("regex_comArea");

                // Build the entity; constructor slots this page has no value for are passed as "".
                NewHouse house = new NewHouse(
                    lpm, updatetime, xzq, pq, "", "", "", mj, dj,
                    "", jg, "", zj, szlc, zlc, hx, cx, zx, jznd,
                    title, phone, url, "", 网站名称, address,
                    "", "", "", "", "", "", comName, comArea);

                // Persist the record and trace the save.
                SaveNowData(house);
                string savedMessage = string.Format(
                    "{0}-数据保存完成url:{1}--cityname:{2}--value_title:{3}--value_lpm{4}",
                    DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), url, CityName, title, lpm);
                log.Debug(savedMessage);
            }
            catch (Exception ex)
            {
                // Best effort: a failing page is logged and skipped.
                log.Error(string.Format("GetHouseByUrl()异常,url:{0}, cityName:{1}", url, CityName), ex);
            }
        }
Пример #9
0
        /// <summary>
        /// Scrapes one listing detail page and saves the resulting <c>NewHouse</c>.
        /// Most fields come from the detail page at <paramref name="url"/>; the case
        /// timestamp is extracted from <paramref name="_infoText"/>, and the agency name
        /// is fetched from a separate user-data page built from the extracted user id.
        /// </summary>
        /// <remarks>
        /// Every exception thrown inside the method is caught and written to the error
        /// log, so nothing propagates to the caller.
        /// </remarks>
        /// <param name="url">URL of the detail page.</param>
        /// <param name="_infoText">Text of the list-page entry that corresponds to this detail page.</param>
        public void GetHouseByUrl(string url, string _infoText)
        {
            try
            {
                // Field rules (regular expressions) applied to the detail page.
                // regex_datetime is intentionally not part of this set; it is applied
                // to the list-page text through its own rule set below.
                Dictionary<string, RegexInfo> detailRules = new Dictionary<string, RegexInfo>
                {
                    { "*regex_lpm", regex_lpm },
                    { "*regex_xzq", regex_xzq },
                    { "regex_pq", regex_pq },
                    { "regex_hx", regex_hx },
                    { "*regex_mj", regex_mj },
                    { "*regex_dj", regex_dj },
                    { "*regex_zj", regex_zj },
                    { "regex_jznd", regex_jznd },
                    { "regex_cx", regex_cx },
                    { "regex_szlc", regex_szlc },
                    { "regex_zlc", regex_zlc },
                    { "regex_jg", regex_jg },
                    { "regex_zx", regex_zx },
                    { "regex_yt", regex_yt },
                    { "regex_jzxs", regex_jzxs },
                    { "regex_ptss", regex_ptss },
                    { "regex_title", regex_title },
                    { "regex_phone", regex_phone },
                    { "regex_address", regex_address },
                    { "regex_hymj", regex_hymj },
                    { "regex_tjg", regex_tjg },
                    { "regex_cwsl", regex_cwsl },
                    { "regex_dxsmj", regex_dxsmj },
                    { "regex_userId", regex_userId }
                };
                Dictionary<string, RegexInfo> listTextRules = new Dictionary<string, RegexInfo>
                {
                    { "regex_datetime", regex_datetime }
                };

                // Apply the rules: detail page over HTTP, timestamp from the supplied text.
                Dictionary<string, List<string>> detailMatches =
                    SpiderHelp.GetHtmlByRegex(url, "utf-8", detailRules, WebObj, CityId);
                Dictionary<string, List<string>> listTextMatches =
                    SpiderHelp.GetStrByRegex(_infoText, listTextRules);

                // First value returned for a rule, or "" when the rule produced nothing.
                Func<Dictionary<string, List<string>>, string, string> firstOrEmpty = (source, key) =>
                {
                    List<string> hits = source[key];
                    return hits.Count < 1 ? "" : hits[0];
                };

                string title   = firstOrEmpty(detailMatches, "regex_title");
                string lpm     = firstOrEmpty(detailMatches, "*regex_lpm");
                string xzq     = firstOrEmpty(detailMatches, "*regex_xzq");
                string pq      = firstOrEmpty(detailMatches, "regex_pq");
                string hx      = firstOrEmpty(detailMatches, "regex_hx");
                string mj      = firstOrEmpty(detailMatches, "*regex_mj");
                string dj      = firstOrEmpty(detailMatches, "*regex_dj");
                string zj      = firstOrEmpty(detailMatches, "*regex_zj");
                string jznd    = firstOrEmpty(detailMatches, "regex_jznd");
                string cx      = firstOrEmpty(detailMatches, "regex_cx");
                string szlc    = firstOrEmpty(detailMatches, "regex_szlc");
                string zlc     = firstOrEmpty(detailMatches, "regex_zlc");
                string jg      = firstOrEmpty(detailMatches, "regex_jg");
                string zx      = firstOrEmpty(detailMatches, "regex_zx");
                // Looked up like every other field, but not passed on to NewHouse below.
                string yt      = firstOrEmpty(detailMatches, "regex_yt");
                string jzxs    = firstOrEmpty(detailMatches, "regex_jzxs");
                string ptss    = firstOrEmpty(detailMatches, "regex_ptss");
                string phone   = firstOrEmpty(detailMatches, "regex_phone");
                string address = firstOrEmpty(detailMatches, "regex_address");
                string hymj    = firstOrEmpty(detailMatches, "regex_hymj");
                string tjg     = firstOrEmpty(detailMatches, "regex_tjg");
                string cwsl    = firstOrEmpty(detailMatches, "regex_cwsl");
                string dxsmj   = firstOrEmpty(detailMatches, "regex_dxsmj");
                string userId  = firstOrEmpty(detailMatches, "regex_userId");
                string rawDatetime = firstOrEmpty(listTextMatches, "regex_datetime");

                // Agency (company) name lives on a separate user-data page keyed by user id.
                string userInfoUrlFormat = "http://user.58.com/userdata?userid={0}&type=10";
                Dictionary<string, RegexInfo> companyRules = new Dictionary<string, RegexInfo>
                {
                    { "regex_comName", regex_comName }
                };
                Dictionary<string, List<string>> companyMatches = SpiderHelp.GetHtmlByRegex(
                    string.Format(userInfoUrlFormat, userId), "utf-8", companyRules, WebObj, CityId);
                string comName = firstOrEmpty(companyMatches, "regex_comName");

                // Only the case time is converted; the building-year value is passed through as extracted.
                string caseTimeText = 转换案例时间(rawDatetime);

                // Build the entity; constructor slots without a value here are passed as "".
                NewHouse house = new NewHouse(
                    lpm, caseTimeText, xzq, pq, "", "", "", mj, dj,
                    "", jg, "", zj, szlc, zlc, hx, cx, zx, jznd,
                    title, phone, url, "", 网站名称, address,
                    jzxs, hymj, tjg, cwsl, ptss, dxsmj, comName, "");

                // Strip surrounding whitespace from the case time (a null value stays null).
                string storedCaseTime = house.Alsj;
                house.Alsj = storedCaseTime != null ? storedCaseTime.Trim() : storedCaseTime;

                // Cutoff depends on the hour the crawl started: before noon, yesterday's
                // records are still accepted; from noon on, only today's.
                int startHour = Convert.ToInt32(nowDate.ToString("HH"));
                DateTime caseTime = Convert.ToDateTime(house.Alsj);
                DateTime cutoff = Convert.ToDateTime(nowDate.ToString("yyyy-MM-dd"));
                if (startHour < 12)
                {
                    cutoff = cutoff.AddDays(-1);
                }
                if (caseTime < cutoff)
                {
                    isNowPageStop = true;
                }

                // Type pages are crawled by multiple threads, so for now the crawl is set
                // to never stop: the flag is forced back to false. The comparison above is
                // still evaluated, so a case time Convert.ToDateTime cannot parse throws
                // into the catch below and the record is not saved.
                isNowPageStop = false;

                // Persist the record and trace the save.
                SaveNowData(house);
                string savedMessage = string.Format(
                    "{0}数据保存完成url:{1}--cityname:{2}--value_title:{3}--value_lpm{4}--案例时间:{5}--爬取时间:{6}",
                    DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), url, CityName, title, lpm, house.Alsj, nowDate);
                log.Debug(savedMessage);
            }
            catch (Exception ex)
            {
                // Best effort: a failing page is logged and skipped.
                log.Error(string.Format("GetHouseByUrl()异常,url:{0}, cityName:{1}", url, CityName), ex);
            }
        }
Пример #10
0
        /// <summary>
        /// Scrapes one listing detail page, normalises a few fields (title, floor order,
        /// purpose, case time) and saves the resulting <c>NewHouse</c>. Sets
        /// <c>isNowPageStop</c> when the record is older than the crawl's cutoff date.
        /// </summary>
        /// <remarks>
        /// Records whose purpose is rejected by <c>CheckPurpose</c> (result "0") are logged
        /// and dropped. Every exception thrown inside the method is caught and written to
        /// the error log, so nothing propagates to the caller.
        /// </remarks>
        /// <param name="url">URL of the detail page.</param>
        public void GetHouseByUrl(string url)
        {
            try
            {
                // NOTE(review): earlier commented-out experiments parsed a timestamp such as
                // "Tue Apr 29 15:17:39 GMT+08:00 2014" - presumably the format this site
                // publishes, which would explain the month-name fallback further down; confirm.

                // Field rules (regular expressions) applied to the detail page.
                Dictionary<string, RegexInfo> fieldRules = new Dictionary<string, RegexInfo>
                {
                    { "regex_lpm", regex_lpm },
                    { "*regex_xzq", regex_xzq },
                    { "regex_pq", regex_pq },
                    { "regex_hx", regex_hx },
                    { "*regex_mj", regex_mj },
                    { "*regex_dj", regex_dj },
                    { "*regex_zj", regex_zj },
                    { "regex_jznd", regex_jznd },
                    { "regex_cx", regex_cx },
                    { "regex_szlc", regex_szlc },
                    { "regex_zlc", regex_zlc },
                    { "regex_jg", regex_jg },
                    { "regex_yt", regex_yt },
                    { "regex_zx", regex_zx },
                    { "regex_title", regex_title },
                    { "regex_phone", regex_phone },
                    { "*regex_address", regex_address },
                    { "regex_datetime", regex_datetime },
                    { "regex_comName", regex_comName },
                    { "regex_comArea", regex_comArea }
                };

                // Fetch the page and apply every rule to it.
                Dictionary<string, List<string>> matches =
                    SpiderHelp.GetHtmlByRegex(url, "utf-8", fieldRules, WebObj, CityId);

                // First value returned for a rule, or "" when the rule produced nothing.
                Func<string, string> firstOrEmpty = key =>
                {
                    List<string> hits = matches[key];
                    return hits.Count < 1 ? "" : hits[0];
                };

                string title       = firstOrEmpty("regex_title");
                string lpm         = firstOrEmpty("regex_lpm");
                string xzq         = firstOrEmpty("*regex_xzq");
                string pq          = firstOrEmpty("regex_pq");
                string hx          = firstOrEmpty("regex_hx");
                string mj          = firstOrEmpty("*regex_mj");
                string dj          = firstOrEmpty("*regex_dj");
                string zj          = firstOrEmpty("*regex_zj");
                string jznd        = firstOrEmpty("regex_jznd");
                string cx          = firstOrEmpty("regex_cx");
                string szlc        = firstOrEmpty("regex_szlc");
                string zlc         = firstOrEmpty("regex_zlc");
                string jg          = firstOrEmpty("regex_jg");
                string yt          = firstOrEmpty("regex_yt");
                string zx          = firstOrEmpty("regex_zx");
                string phone       = firstOrEmpty("regex_phone");
                string address     = firstOrEmpty("*regex_address");
                string rawDatetime = firstOrEmpty("regex_datetime");
                string comName     = firstOrEmpty("regex_comName");
                string comArea     = firstOrEmpty("regex_comArea");

                // The extracted lpm value is discarded; the record always carries "无".
                lpm = "无";
                title = title.Replace("&nbsp;", "");

                // If the current-floor and total-floor values arrived swapped (current
                // floor greater than total floors), put them back in order. Both values
                // are only replaced by their trimmed forms when a swap happens.
                string trimmedZlc = zlc.TrimBlank();
                if (StringHelp.IsInteger(trimmedZlc))
                {
                    string trimmedSzlc = szlc.TrimBlank();
                    if (StringHelp.IsInteger(trimmedSzlc)
                        && Convert.ToInt32(trimmedSzlc) > Convert.ToInt32(trimmedZlc))
                    {
                        szlc = trimmedZlc;
                        zlc  = trimmedSzlc;
                    }
                }

                // Validate/normalise the purpose; "0" marks a record that must be dropped.
                yt = CheckPurpose(yt);
                if (yt.Equals("0"))
                {
                    log.Debug(string.Format("GetHouseByUrl()用途无效,url:{0}, cityName:{1},用途:{2}", url, CityName, Convert.ToString(yt)));
                    return;
                }

                // Build the entity; constructor slots this page has no value for are passed as "".
                NewHouse house = new NewHouse(
                    lpm, rawDatetime, xzq, pq, "", "", yt, mj, dj,
                    "", jg, "", zj, szlc, zlc, hx, cx, zx, jznd,
                    title, phone, url, "", 网站名称, address,
                    "", "", "", "", "", "", comName, comArea);

                // Strip surrounding whitespace from the case time (a null value stays null).
                string storedCaseTime = house.Alsj;
                house.Alsj = storedCaseTime != null ? storedCaseTime.Trim() : storedCaseTime;

                if (!house.Alsj.CheckStrIsDate())
                {
                    // The case time is not a recognisable date: try to rebuild it from the raw
                    // text. The first `time` match is used as a composite format string and the
                    // first `time2` match as the month text.
                    List<string> formatHits = SpiderHelp.GetStrByRegexByIndex(rawDatetime, time);
                    List<string> monthHits  = SpiderHelp.GetStrByRegexByIndex(rawDatetime, time2);
                    bool haveBoth = formatHits != null && formatHits.Count > 0
                                    && monthHits != null && monthHits.Count > 0;
                    if (haveBoth)
                    {
                        string dateFormat = formatHits[0];
                        string monthText  = monthHits[0];
                        // Each `months` entry is comma-separated: part [0] is searched for in the
                        // lower-cased month text, part [1] is substituted into the format string.
                        string monthEntry = months.FirstOrDefault(
                            entry => monthText.ToLower().Contains(entry.Split(',')[0]));
                        if (!string.IsNullOrEmpty(monthEntry))
                        {
                            house.Alsj = string.Format(dateFormat, monthEntry.Split(',')[1]);
                        }
                    }

                    // Still not a date: fall back to this time yesterday.
                    if (!house.Alsj.CheckStrIsDate())
                    {
                        house.Alsj = DateTime.Now.AddDays(-1).ToString();
                    }
                }

                // Cutoff depends on the hour the crawl started: before noon, yesterday's
                // records are still accepted; from noon on, only today's. An older record
                // raises the stop flag.
                int startHour = Convert.ToInt32(nowDate.ToString("HH"));
                DateTime caseTime = Convert.ToDateTime(house.Alsj);
                DateTime cutoff = Convert.ToDateTime(nowDate.ToString("yyyy-MM-dd"));
                if (startHour < 12)
                {
                    cutoff = cutoff.AddDays(-1);
                }
                if (caseTime < cutoff)
                {
                    isNowPageStop = true;
                }

                // Persist the record and trace the save.
                SaveNowData(house);
                string savedMessage = string.Format(
                    "{0}-数据保存完成url:{1}--cityname:{2}--value_title:{3}--value_lpm{4}",
                    DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), url, CityName, title, lpm);
                log.Debug(savedMessage);
            }
            catch (Exception ex)
            {
                // Best effort: a failing page is logged and skipped.
                log.Error(string.Format("GetHouseByUrl()异常,url:{0}, cityName:{1}", url, CityName), ex);
            }
        }