Exemplo n.º 1
0
        /// <summary>
        /// Reads order ids (one per line) from the Taobao proxy-IP config file and, for every order,
        /// repeatedly requests a proxy IP from the remote service and imports it until the service
        /// answers with the "order quota exhausted" error text.
        /// </summary>
        public void Run()
        {
            string configPath = AppDomain.CurrentDomain.BaseDirectory + "IP代理/Config/TaobaoProxyIpConfig.txt";
            List<string> orderList = new List<string>();

            // 'using' releases the file handle even when reading fails part-way through.
            using (StreamReader sr = new StreamReader(configPath))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    orderList.Add(line);
                }
            }

            string          url       = "http://121.199.38.28/ip/?tid={0}&num=1&ports=80,808,3128&filter=on";
            string          error     = "ERROR|订单剩余数量不足";
            SysData_ProxyIp existsObj = new SysData_ProxyIp();
            string          message   = "";

            foreach (string orderStr in orderList)
            {
                string requestUrl = string.Format(url, orderStr);
                while (true)
                {
                    // Keep retrying (3 s apart) until the service returns a response body.
                    // A null body is treated like a failed request instead of crashing on Contains below.
                    string ipHtml = null;
                    while (ipHtml == null)
                    {
                        try
                        {
                            ipHtml = SpiderHelp.GetHtml(requestUrl, "utf-8");
                        }
                        catch (Exception ex)
                        {
                            // Previously swallowed silently; record why the request is being retried.
                            log.Debug(string.Format("获取代理ip失败,3秒后重试,订单:{0},url:{1},异常:{2}", orderStr, requestUrl, ex.Message));
                        }
                        if (ipHtml == null)
                        {
                            System.Threading.Thread.Sleep(3000);
                        }
                    }

                    // The service signals that this order has no IPs left; move on to the next order.
                    if (ipHtml.Contains(error))
                    {
                        break;
                    }

                    int result = ProxyIpHelp.ImportProxyIp(ipHtml, "匿名", out existsObj, out message);
                    if (result != 1)
                    {
                        log.Debug(string.Format("{0},订单:{1},url:{2},ip:{3}", message, orderStr, url, ipHtml));
                    }
                    else
                    {
                        log.Debug(string.Format("ip插入成功,订单:{0},url:{1},ip:{2}", orderStr, url, ipHtml));
                    }
                }
                log.Debug(string.Format("订单:{0}导入完成,url:{1}", orderStr, url));
            }
            // The original logged the raw text "url:{1}" because the message was never formatted.
            log.Debug(string.Format("所有订单:导入完成,url:{0}", url));
        }
Exemplo n.º 2
0
        /// <summary>
        /// Crawls a paged proxy-IP listing and imports every IP found on each page.
        /// A page is requested up to three times; when all attempts yield no entries the crawl
        /// restarts from page 1. The loop only ends when an exception is thrown, which is logged.
        /// </summary>
        /// <param name="url">Base listing URL; "&amp;pageid=N" is appended to address page N.</param>
        public void RunPageList(object url)
        {
            string urlText         = Convert.ToString(url); // retained from the original; not used below
            string pageUrlTemplate = url + "&pageid={0}";

            try
            {
                const int maxAttempts = 3;
                SysData_ProxyIp existsObj = new SysData_ProxyIp();
                string message   = "";
                int    pageIndex = 1;

                while (true)
                {
                    string pageUrl = string.Format(pageUrlTemplate, pageIndex);

                    // Request the page until it yields at least one entry or the attempts run out.
                    List<string> ipInfoList = null;
                    for (int attempt = 1; attempt <= maxAttempts; attempt++)
                    {
                        Dictionary<string, List<string>> pageMatches = SpiderHelp.GetHtmlByRegexNotProxyIp(pageUrl, "gb2312", regexDic);
                        ipInfoList = pageMatches["regex_ipInfo"]; // every IP block on the page
                        if (ipInfoList != null && ipInfoList.Count >= 1)
                        {
                            break;
                        }
                    }

                    if (ipInfoList == null || ipInfoList.Count < 1)
                    {
                        // Nothing found after all attempts: start over from the first page.
                        pageIndex = 1;
                        log.Debug(string.Format("未获取到IP列表,url:{0}", url));
                        continue;
                    }

                    foreach (string ipInfo in ipInfoList)
                    {
                        Dictionary<string, List<string>> fields = SpiderHelp.GetStrByRegex(ipInfo, regexDic);
                        string ip;
                        if (fields["regex_ip"].Count < 1)
                        {
                            ip = "";
                        }
                        else
                        {
                            ip = fields["regex_ip"][0];
                        }
                        ip = ip.TrimBlank();

                        int result = ProxyIpHelp.ImportProxyIp(ip, "匿名", out existsObj, out message);
                        if (result == 1)
                        {
                            log.Debug(string.Format("ip插入成功,url:{0},ip:{1}", url, ip ?? "null"));
                        }
                        else
                        {
                            log.Debug(string.Format("{0},url:{1},ip:{2}", message, url, ip ?? "null"));
                        }
                    }

                    pageIndex += 1;
                }
            }
            catch (Exception ex)
            {
                log.Error("系统异常", ex);
            }
        }
Exemplo n.º 3
0
 /// <summary>
 /// Fetches one listing page, extracts each listing panel, resolves the panel's detail-page URL
 /// and passes it to <c>GetHouseByUrl</c>. Also reports the link to the next listing page.
 /// Any exception is logged and not rethrown.
 /// </summary>
 /// <param name="hostUrl">Host prefix prepended to links that do not contain "http://".</param>
 /// <param name="pageListUrl">URL of the listing page to fetch.</param>
 /// <param name="rate">Delay in milliseconds before each panel is processed; skipped when not positive.</param>
 /// <param name="pageCheckRate">Delay in milliseconds before the page is fetched; skipped when not positive.</param>
 /// <param name="一页链接">Receives the next-page link, prefixed with <paramref name="hostUrl"/> when it lacks "http://".</param>
 public void SpiderHouseByPageListUrl(string hostUrl, string pageListUrl, int rate, int pageCheckRate, out string 一页链接)
 {
     if (pageCheckRate > 0)
     {
         System.Threading.Thread.Sleep(pageCheckRate);
     }
     一页链接 = "";
     try
     {
         Dictionary<string, RegexInfo> patterns = new Dictionary<string, RegexInfo>
         {
             { "*regex_infPanel", regex_infPanel },
             { "*regex_nextPage", regex_nextPage }
         };

         // Download the page and apply both patterns in one pass.
         Dictionary<string, List<string>> matches = SpiderHelp.GetHtmlByRegex(pageListUrl, "gb2312", patterns, WebObj, CityId, timeout: 30000);
         List<string> panels    = matches["*regex_infPanel"];
         List<string> nextLinks = matches["*regex_nextPage"];

         if (nextLinks.Count >= 1)
         {
             一页链接 = nextLinks[0];
         }
         else
         {
             一页链接 = "";
         }
         if (!一页链接.ToLower().Contains("http://"))
         {
             一页链接 = hostUrl + 一页链接;
         }

         foreach (string panelHtml in panels)
         {
             if (rate > 0)
             {
                 System.Threading.Thread.Sleep(rate);
             }

             List<string> detailUrls = SpiderHelp.GetStrByRegexByIndex(panelHtml, regex_infUrl);
             if (detailUrls != null && detailUrls.Count >= 1)
             {
                 string detailUrl = detailUrls[0];
                 // Relative link: prepend the host.
                 string absoluteUrl = detailUrl.ToLower().Contains("http://") ? detailUrl : hostUrl + detailUrl;

                 GetHouseByUrl(absoluteUrl, panelHtml);
                 if (isNowPageStop)
                 {
                     break;
                 }
             }
             else
             {
                 log.Error(string.Format("SpiderHouseByPageListUrl()未获取到详细页面url,hostUrl:{0}, pageListUrl:{1}, cityName{2}", hostUrl, pageListUrl, CityName));
             }
         }
     }
     catch (Exception ex)
     {
         log.Error(string.Format("SpiderHouse()异常,hostUrl:{0}, pageListUrl:{1}, cityName{2}", hostUrl, pageListUrl, CityName), ex);
     }
 }
Exemplo n.º 4
0
        /// <summary>
        /// Builds (in memory only) the "exec ..." configuration statements for every soufun.com city
        /// that exists in the local city table, has no crawl configuration yet, and whose second-hand
        /// listing page reports a non-empty total count. The resulting text is not persisted here.
        /// </summary>
        public void start()
        {
            const string execTemplate = " exec 往网站爬取配置表添加配置信息 '{0}','{1}','{2}','{3}','{4}',{5},{6}";

            RegexInfo totalCountRegex = new RegexInfo("共找到<strong class=\"number orange\">([\\d]*)</strong>条", "$1");
            RegexInfo cityBlockRegex  = new RegexInfo("<div class=\"onCont\" id=\"c01\"[^<>]*>((?:(?!</div>).)*)</div>", "$1");
            RegexInfo cityLinkRegex   = new RegexInfo("(<a href=\"[^\"]+\"[^<>]*>[^<>]+</a>)", "$1");

            // The same dictionary is reused and grows between the three lookups below.
            Dictionary<string, RegexInfo> sharedPatterns = new Dictionary<string, RegexInfo>();
            sharedPatterns.Add("城市列表Text", cityBlockRegex);
            Dictionary<string, List<string>> blockMatches = SpiderHelp.GetHtmlByRegex("http://soufun.com/SoufunFamily.htm", "utf-8", sharedPatterns, WebsiteManager.GetWebById(WebsiteManager.搜房网_ID), CityId);
            List<string> blockHits = blockMatches["城市列表Text"];
            string cityBlockHtml = "";
            if (blockHits.Count > 0)
            {
                cityBlockHtml = blockMatches["城市列表Text"][0];
            }

            sharedPatterns.Add("城市列表", cityLinkRegex);
            List<string> cityAnchors = SpiderHelp.GetStrByRegex(cityBlockHtml, sharedPatterns)["城市列表"];

            StringBuilder script = new StringBuilder();
            sharedPatterns.Add("总条数", totalCountRegex);
            List<string> statements = new List<string>();

            foreach (string anchorHtml in cityAnchors)
            {
                RegexInfo nameRegex = new RegexInfo("<a href=\"[^\"]+\"[^<>]*>([^<>]+)</a>", "$1");
                RegexInfo urlRegex  = new RegexInfo("<a href=\"([^\"]+)\"[^<>]*>[^<>]+</a>", "$1");
                Dictionary<string, RegexInfo> anchorPatterns = new Dictionary<string, RegexInfo>
                {
                    { "regexCityName", nameRegex },
                    { "regexCityUrl", urlRegex }
                };
                Dictionary<string, List<string>> anchorParts = SpiderHelp.GetStrByRegex(anchorHtml, anchorPatterns);

                string rawName  = anchorParts["regexCityName"].Count > 0 ? anchorParts["regexCityName"][0] : "";
                string cityName = StringHelp.TrimBlank(rawName);
                string cityUrl  = anchorParts["regexCityUrl"].Count > 0 ? anchorParts["regexCityUrl"][0] : "";

                城市表 city = CityManager.Get城市_byLike城市名称(cityName);
                // Skip unknown cities and cities that already have a configuration for this site.
                if (city == null || SpiderWebConfigManager.get网站爬取配置_by城市ID_网站ID(city.ID, WebsiteManager.搜房网_ID) != null)
                {
                    continue;
                }

                string esfHost = cityUrl.Replace("http://", "http://esf.").TrimEnd('/');
                if (city.城市名称.Contains("北京"))
                {
                    esfHost = esfHost.Replace("bj.", "");
                }
                string listUrl = esfHost + "/house/h316-j3100-w32/";

                Dictionary<string, List<string>> countMatches = SpiderHelp.GetHtmlByRegex(listUrl, "gbk", sharedPatterns, WebsiteManager.GetWebById(WebsiteManager.搜房网_ID), CityId);
                string totalCount = countMatches["总条数"].Count > 0 ? countMatches["总条数"][0] : "";
                if (string.IsNullOrEmpty(totalCount))
                {
                    continue;
                }

                string statement = string.Format(execTemplate, WebsiteManager.搜房网, city.城市名称, esfHost, listUrl,
                                                 "", "4000", "2000");
                script.Append(statement).Append("\r\n");
                statements.Add(statement);
            }

            // Kept as in the original: the generated text is only materialized, not written anywhere.
            string result = script.ToString();
        }
Exemplo n.º 5
0
 /// <summary>
 /// Fetches one listing page, collects detail-page URLs from two patterns and passes each to
 /// <c>GetHouseByUrl</c>. Also reports the link to the next listing page.
 /// Any exception is logged and not rethrown.
 /// </summary>
 /// <param name="hostUrl">Host prefix prepended to links that do not contain "http://".</param>
 /// <param name="pageListUrl">URL of the listing page to fetch; also sent as the referer.</param>
 /// <param name="rate">Delay in milliseconds before each detail URL is processed; skipped when not positive.</param>
 /// <param name="pageCheckRate">Delay in milliseconds before the page is fetched; skipped when not positive.</param>
 /// <param name="一页链接">Receives the next-page link, prefixed with <paramref name="hostUrl"/> when it lacks "http://".</param>
 public void SpiderHouseByPageListUrl(string hostUrl, string pageListUrl, int rate, int pageCheckRate, out string 一页链接)
 {
     if (pageCheckRate > 0)
     {
         System.Threading.Thread.Sleep(pageCheckRate);
     }
     一页链接 = "";
     try
     {
         Dictionary<string, RegexInfo> patterns = new Dictionary<string, RegexInfo>
         {
             { "*regex_infUrl", regex_infUrl },
             { "regex_infUrl2", regex_infUrl2 },
             { "*regex_nextPage", regex_nextPage }
         };

         // Download the page and apply all patterns in one pass.
         Dictionary<string, List<string>> matches = SpiderHelp.GetHtmlByRegex(pageListUrl, "utf-8", patterns, WebObj, CityId, referer: pageListUrl);
         List<string> detailUrls    = matches["*regex_infUrl"];
         List<string> secondaryUrls = matches["regex_infUrl2"];
         if (secondaryUrls != null && secondaryUrls.Count > 0)
         {
             // Merge the hits of the second pattern into the primary list.
             detailUrls.AddRange(secondaryUrls);
         }

         List<string> nextLinks = matches["*regex_nextPage"];
         if (nextLinks.Count >= 1)
         {
             一页链接 = nextLinks[0];
         }
         else
         {
             一页链接 = "";
         }
         if (!一页链接.ToLower().Contains("http://"))
         {
             一页链接 = hostUrl + 一页链接;
         }

         foreach (string detailUrl in detailUrls)
         {
             if (rate > 0)
             {
                 System.Threading.Thread.Sleep(rate);
             }

             // Relative link: prepend the host.
             string absoluteUrl = detailUrl.ToLower().Contains("http://") ? detailUrl : hostUrl + detailUrl;
             GetHouseByUrl(absoluteUrl);
             if (isNowPageStop)
             {
                 break;
             }
         }
     }
     catch (Exception ex)
     {
         log.Error(string.Format("SpiderHouse()异常,hostUrl:{0}, pageListUrl:{1}, cityName{2}", hostUrl, pageListUrl, CityName), ex);
     }
 }
Exemplo n.º 6
0
 /// <summary>
 /// Fetches one listing page over a keep-alive request, extracts the detail-page URLs and passes
 /// each to <c>GetHouseByUrl</c>. Also reports the link to the next listing page.
 /// Any exception is logged and not rethrown.
 /// </summary>
 /// <param name="hostUrl">Host prefix prepended to links that do not contain "http://".</param>
 /// <param name="pageListUrl">URL of the listing page to fetch.</param>
 /// <param name="rate">Delay in milliseconds before each detail URL is processed; skipped when not positive.</param>
 /// <param name="pageCheckRate">Delay in milliseconds before the page is fetched; skipped when not positive.</param>
 /// <param name="一页链接">Receives the next-page link, prefixed with <paramref name="hostUrl"/> when it lacks "http://".</param>
 public void SpiderHouseByPageListUrl(string hostUrl, string pageListUrl, int rate, int pageCheckRate, out string 一页链接)
 {
     if (pageCheckRate > 0)
     {
         System.Threading.Thread.Sleep(pageCheckRate);
     }
     一页链接 = "";
     try
     {
         Dictionary<string, RegexInfo> patterns = new Dictionary<string, RegexInfo>
         {
             { "*regex_infUrl", regex_infUrl },
             { "*regex_nextPage", regex_nextPage }
         };

         // Download the page and apply both patterns in one pass.
         Dictionary<string, List<string>> matches = SpiderHelp.GetHtmlByRegex(pageListUrl, "utf-8", patterns, WebObj, CityId, keepAlive: true);
         List<string> detailUrls = matches["*regex_infUrl"];
         List<string> nextLinks  = matches["*regex_nextPage"];

         if (nextLinks.Count >= 1)
         {
             一页链接 = nextLinks[0];
         }
         else
         {
             一页链接 = "";
         }
         if (!一页链接.ToLower().Contains("http://"))
         {
             一页链接 = hostUrl + 一页链接;
         }

         foreach (string detailUrl in detailUrls)
         {
             if (rate > 0)
             {
                 System.Threading.Thread.Sleep(rate);
             }

             // Relative link: prepend the host.
             // Sample detail URL: http://gz.mytophome.com/prop/view/18225321.html?c=so
             string absoluteUrl = detailUrl.ToLower().Contains("http://") ? detailUrl : hostUrl + detailUrl;
             GetHouseByUrl(absoluteUrl);
             if (isNowPageStop)
             {
                 break;
             }
         }
     }
     catch (Exception ex)
     {
         log.Error(string.Format("SpiderHouse()异常,hostUrl:{0}, pageListUrl:{1}, cityName{2}", hostUrl, pageListUrl, CityName), ex);
     }
 }
Exemplo n.º 7
0
        /// <summary>
        /// Builds (in memory only) the "exec ..." configuration statements for every cityhouse.cn city
        /// that exists in the local city table, is not yet configured for this site and has not already
        /// been emitted. Cities whose listing page shows no page count go to a second buffer.
        /// Finally calls <c>导出任务计划配置文件</c>.
        /// </summary>
        public void start()
        {
            const string execTemplate = " exec 往网站爬取配置表添加配置信息 '{0}','{1}','{2}','{3}','{4}',{5},{6}";

            网站表 webObj = WebsiteManager.GetWebById(WebsiteManager.城市房产_ID);

            RegexInfo pageCountRegex = new RegexInfo("<div class=\"[^\"]*\"><span class='fl mr'>\\d+/(\\d+)</span>", "$1");
            Dictionary<string, RegexInfo> pageCountPatterns = new Dictionary<string, RegexInfo>();
            pageCountPatterns.Add("总页数", pageCountRegex);

            RegexInfo cityAnchorRegex = new RegexInfo("(<a[^<>]+href=[\",']{1,1}http\\://[^\\.]+.cityhouse.cn[\",']{1,1}[^<>]*>[^<>]+</a>)", "$1");
            Dictionary<string, RegexInfo> cityListPatterns = new Dictionary<string, RegexInfo>();
            cityListPatterns.Add("城市列表", cityAnchorRegex);

            List<string> cityAnchors = SpiderHelp.GetHtmlByRegex("http://www.cityhouse.cn/city.html", "utf-8", cityListPatterns, WebsiteManager.GetWebById(WebsiteManager.城市房产_ID), CityId)["城市列表"];

            StringBuilder withPages    = new StringBuilder(); // cities whose listing page exposes a page count
            StringBuilder withoutPages = new StringBuilder(); // cities where no page count was matched

            foreach (string anchorHtml in cityAnchors)
            {
                RegexInfo nameRegex = new RegexInfo("<a[^<>]+href=[\",']{1,1}http\\://[^\\.]+.cityhouse.cn[\",']{1,1}[^<>]*>([^<>]+)</a>", "$1");
                RegexInfo urlRegex  = new RegexInfo("<a[^<>]+href=[\",']{1,1}(http\\://[^\\.]+.cityhouse.cn)[\",']{1,1}[^<>]*>[^<>]+</a>", "$1");
                Dictionary<string, RegexInfo> anchorPatterns = new Dictionary<string, RegexInfo>
                {
                    { "regexCityName", nameRegex },
                    { "regexCityUrl", urlRegex }
                };
                Dictionary<string, List<string>> anchorParts = SpiderHelp.GetStrByRegex(anchorHtml, anchorPatterns);

                string rawName  = anchorParts["regexCityName"].Count > 0 ? anchorParts["regexCityName"][0] : "";
                string cityName = StringHelp.TrimBlank(rawName);
                string cityUrl  = anchorParts["regexCityUrl"].Count > 0 ? anchorParts["regexCityUrl"][0] : "";

                城市表 city = CityManager.Get城市_byLike城市名称(cityName);
                // Skip unknown cities, cities already emitted to either buffer, and cities already configured.
                if (city == null
                    || withoutPages.ToString().Contains(city.城市名称)
                    || withPages.ToString().Contains(city.城市名称)
                    || SpiderWebConfigManager.get网站爬取配置_by城市ID_网站ID(city.ID, WebsiteManager.城市房产_ID) != null)
                {
                    continue;
                }

                string listUrl = cityUrl + "/forsale/flist.html?ob=10";
                Dictionary<string, List<string>> pageCountMatches = SpiderHelp.GetHtmlByRegex(listUrl, "utf-8", pageCountPatterns, webObj, CityId, referer: listUrl);

                string statement = string.Format(execTemplate, WebsiteManager.城市房产, city.城市名称, cityUrl, listUrl,
                                                 "", "2000", "2000");

                StringBuilder target = pageCountMatches["总页数"].Count < 1 ? withoutPages : withPages;
                target.Append(statement).Append("\r\n");
            }

            // Kept as in the original: the generated text is only materialized, not written anywhere.
            string result  = withPages.ToString();
            string result2 = withoutPages.ToString();

            导出任务计划配置文件();
        }
Exemplo n.º 8
0
        /// <summary>
        /// Fetches one listing page, extracts the detail-page URLs and enqueues each one on
        /// <c>Url_workload</c> instead of crawling it directly. Before every enqueue the method waits,
        /// polling every 5 seconds, while the queue holds more than 2000 entries.
        /// Also reports the link to the next listing page. Any exception is logged and not rethrown.
        /// </summary>
        /// <param name="hostUrl">Host prefix prepended to detail links that do not contain "http://".</param>
        /// <param name="pageListUrl">URL of the listing page to fetch.</param>
        /// <param name="rate">Crawl delay in milliseconds; not used by this implementation.</param>
        /// <param name="pageCheckRate">Delay in milliseconds before the page is fetched; skipped when not positive.</param>
        /// <param name="一页链接">Receives the next-page link exactly as matched, or "" when none was found.</param>
        public void SpiderHouseByPageListUrl(string hostUrl, string pageListUrl, int rate, int pageCheckRate, out string 一页链接)
        {
            if (pageCheckRate > 0)
            {
                System.Threading.Thread.Sleep(pageCheckRate);
            }
            一页链接 = "";
            try
            {
                Dictionary<string, RegexInfo> patterns = new Dictionary<string, RegexInfo>
                {
                    { "*regex_infUrl", regex_infUrl },
                    { "*regex_nextPage", regex_nextPage }
                };

                // Download the page and apply both patterns in one pass.
                Dictionary<string, List<string>> matches = SpiderHelp.GetHtmlByRegex(pageListUrl, NowPageEncoding, patterns, WebObj, CityId);
                List<string> detailUrls = matches["*regex_infUrl"];
                List<string> nextLinks  = matches["*regex_nextPage"];

                if (nextLinks.Count >= 1)
                {
                    一页链接 = nextLinks[0];
                }
                else
                {
                    一页链接 = "";
                }

                foreach (string detailUrl in detailUrls)
                {
                    // Relative link: prepend the host.
                    string absoluteUrl = detailUrl.ToLower().Contains("http://") ? detailUrl : hostUrl + detailUrl;

                    // Back-pressure: hold off while the work queue is above its threshold.
                    while (Url_workload.Count > 2000)
                    {
                        Thread.Sleep(5000);
                    }
                    Url_workload.Enqueue(absoluteUrl);
                }
            }
            catch (Exception ex)
            {
                log.Error(string.Format("SpiderHouse()异常,hostUrl:{0}, pageListUrl:{1}, cityName{2}", hostUrl, pageListUrl, CityName), ex);
            }
        }
Exemplo n.º 9
0
        /// <summary>
        /// Cleans up the string fields of a scraped case record, converts the numeric strings into
        /// their typed counterparts, derives the computed fields and sets the case date.
        /// The record is modified in place and the same instance is returned.
        /// </summary>
        /// <param name="ent">Scraped case record to normalize; mutated by this method.</param>
        /// <param name="案例时间">Case date as text. It is trimmed when not null; when
        /// <c>StringHelp.CheckStrIsDate</c> rejects it, the current local time is used instead.</param>
        /// <param name="城市名称">City name, passed through to <c>SpiderHelp.GetHousePurposes</c>.</param>
        /// <returns>The same <paramref name="ent"/> instance after normalization.</returns>
        public static 案例信息 爬取信息数据格式计算整理(案例信息 ent, string 案例时间, string 城市名称)
        {
            // Normalize the raw string fields; each "...String" field is cleaned first and then
            // converted into its typed counterpart on the same line.
            ent.总楼层String  = StringHelp.TrimBlank(ent.总楼层String).ToRemoveSpe(); ent.总楼层 = CaseColumnConvertToInt(ent.总楼层String);
            ent.所在楼层String = StringHelp.TrimBlank(ent.所在楼层String).ToRemoveSpe(); ent.所在楼层 = CaseColumnConvertToInt(ent.所在楼层String);
            ent.建筑形式       = StringHelp.TrimBlank(ent.建筑形式).ToRemoveSpe();
            ent.楼盘名        = StringHelp.TrimBlank(ent.楼盘名).ToRemoveSpe();
            // NOTE(review): unlike the neighbouring fields, 行政区 is dereferenced with .Trim() before
            // TrimBlank is applied, so a null value throws here.
            ent.行政区        = StringHelp.TrimBlank(ent.行政区.Trim().ToRemoveSpe());
            ent.总价String   = StringHelp.TrimBlank(ent.总价String).ToRemoveSpe(); ent.总价 = CaseColumnConvertToDecimal(ent.总价String);
            ent.用途         = StringHelp.TrimBlank(ent.用途).ToRemoveSpe();
            ent.朝向         = SpiderHelp.处理朝向字符(StringHelp.TrimBlank(ent.朝向).ToRemoveSpe());
            // Clear the orientation when the lookup returns a value below 1
            // (presumably "no such orientation" — confirm against 朝向Manager).
            if (朝向Manager.Get朝向_根据名称(ent.朝向) < 1)
            {
                ent.朝向 = "";
            }
            ent.电话          = StringHelp.TrimBlank(ent.电话).ToRemoveSpe();
            // The pattern \..* removes everything from the first '.' to the end of the line, i.e. the
            // fractional part is cut off (not rounded) before conversion. Regex.Replace throws on a null input.
            ent.面积String    = Regex.Replace(ent.面积String, @"\..*", "", RegexOptions.IgnoreCase); ent.面积 = CaseColumnConvertToDecimal(ent.面积String);
            ent.单价String    = Regex.Replace(ent.单价String, @"\..*", "", RegexOptions.IgnoreCase); ent.单价 = CaseColumnConvertToDecimal(ent.单价String);
            ent.花园面积String  = Regex.Replace(ent.花园面积String, @"\..*", "", RegexOptions.IgnoreCase); ent.花园面积 = CaseColumnConvertToDecimal(ent.花园面积String);
            ent.地下室面积String = Regex.Replace(ent.地下室面积String, @"\..*", "", RegexOptions.IgnoreCase); ent.地下室面积 = CaseColumnConvertToDecimal(ent.地下室面积String);
            ent.车位数量String  = StringHelp.TrimBlank(ent.车位数量String).ToRemoveSpe(); ent.车位数量 = CaseColumnConvertToInt(ent.车位数量String);
            // Map the scraped wording "平层" onto the 结构Manager.平面 value.
            ent.结构          = StringHelp.TrimBlank(ent.结构.Replace("平层", 结构Manager.平面));
            // Layout: drop the "N厨"/"N卫" parts, rename "室" to "房", then pass the text through
            // NumberConvertToChinese (presumably digits to Chinese numerals — confirm).
            ent.户型          = Regex.Replace(ent.户型, @"(\d*厨|\d*卫)", "", RegexOptions.IgnoreCase);
            ent.户型          = ent.户型.Replace("室", "房");
            ent.户型          = StringHelp.NumberConvertToChinese(ent.户型);
            // Fall back to the default case type / currency when none was scraped.
            ent.案例类型        = StringHelp.TrimBlank(string.IsNullOrEmpty(ent.案例类型) ? 案例类型Manager.买卖报盘 : ent.案例类型);
            ent.币种          = StringHelp.TrimBlank(string.IsNullOrEmpty(ent.币种) ? 币种Manager.人民币 : ent.币种);
            // NOTE(review): in a verbatim string, \\& matches a literal backslash followed by '&'.
            // If the intent is to strip HTML entities such as "&nbsp;", this pattern never matches
            // them — confirm the expected input before changing it.
            ent.信息          = Regex.Replace(ent.信息, @"\\&[a-z0-9A-Z_]*;", "", RegexOptions.IgnoreCase);
            ent.地址          = Regex.Replace(ent.地址, @"\\&[a-z0-9A-Z_]*;", "", RegexOptions.IgnoreCase);
            // Derive computed fields. Order matters: 建筑类型 must be set before 用途, which reads it.
            ent.结构   = SpiderHelp.获取户型结构(ent.结构);
            ent.建筑类型 = SpiderHelp.GetBuildingType(Convert.ToString(ent.总楼层));                                                                                                                    // derive the building type from the total floor count
            ent.用途   = SpiderHelp.GetHousePurposes(ent.用途, 城市名称, Convert.ToString(ent.单价), Convert.ToString(ent.面积), Convert.ToString(ent.总楼层), Convert.ToString(ent.所在楼层), ent.建筑形式, ent.建筑类型); // derive the usage
            ent.户型   = SpiderHelp.GetHouseType(ent.户型).ToRemoveSpe();
            案例时间     = 案例时间 != null?案例时间.Trim() : 案例时间;

            ent.装修 = SpiderHelp.获取装修类型(ent.装修);
            // Use the supplied date when it passes the date check; otherwise stamp the record with "now".
            if (!StringHelp.CheckStrIsDate(案例时间))
            {
                ent.案例时间 = DateTime.Now;
            }
            else
            {
                ent.案例时间 = Convert.ToDateTime(案例时间);
            }
            return(ent);
        }
Exemplo n.º 10
0
        /// <summary>
        /// Fetches one listing page, extracts every listing's HTML fragment, pulls the detail-page URL
        /// out of each fragment and passes both to <c>GetHouseByUrl</c>. Also reports the link to the
        /// next listing page. Any exception is logged and not rethrown.
        /// </summary>
        /// <param name="hostUrl">Host prefix prepended to detail links that do not contain "http://".</param>
        /// <param name="pageListUrl">URL of the listing page to fetch.</param>
        /// <param name="rate">Delay in milliseconds before each listing is processed; skipped when not positive.</param>
        /// <param name="pageCheckRate">Delay in milliseconds before the page is fetched; skipped when not positive.</param>
        /// <param name="一页链接">Receives the next-page link exactly as matched, or "" when none was found.</param>
        public void SpiderHouseByPageListUrl(string hostUrl, string pageListUrl, int rate, int pageCheckRate, out string 一页链接)
        {
            if (pageCheckRate > 0)
            {
                System.Threading.Thread.Sleep(pageCheckRate);
            }
            一页链接 = "";

            try
            {
                Dictionary<string, RegexInfo> pagePatterns = new Dictionary<string, RegexInfo>
                {
                    { "*regex_nextPage", regex_nextPage },
                    { "*regex_listinfo", regex_listinfo }
                };
                Dictionary<string, RegexInfo> itemPatterns = new Dictionary<string, RegexInfo>
                {
                    { "regex_infUrl", regex_infUrl }
                };

                // Download the page and apply the page-level patterns in one pass.
                Dictionary<string, List<string>> pageMatches = SpiderHelp.GetHtmlByRegex(pageListUrl, "utf-8", pagePatterns, WebObj, CityId);
                List<string> listings  = pageMatches["*regex_listinfo"];
                List<string> nextLinks = pageMatches["*regex_nextPage"];

                if (nextLinks.Count >= 1)
                {
                    一页链接 = nextLinks[0];
                }
                else
                {
                    一页链接 = "";
                }

                foreach (string listingHtml in listings)
                {
                    if (rate > 0)
                    {
                        System.Threading.Thread.Sleep(rate);
                    }

                    List<string> urlHits = SpiderHelp.GetStrByRegex(listingHtml, itemPatterns)["regex_infUrl"];
                    string detailUrl = "";
                    if (urlHits.Count > 0)
                    {
                        detailUrl = urlHits[0];
                    }
                    // Relative (or missing) link: prepend the host.
                    if (!detailUrl.ToLower().Contains("http://"))
                    {
                        detailUrl = hostUrl + detailUrl;
                    }

                    GetHouseByUrl(detailUrl, listingHtml);
                    if (isNowPageStop)
                    {
                        break;
                    }
                }
            }
            catch (Exception ex)
            {
                log.Error(string.Format("SpiderHouse()异常,hostUrl:{0}, pageListUrl:{1}, cityName{2}", hostUrl, pageListUrl, CityName), ex);
            }
        }
Exemplo n.º 11
0
 /// <summary>
 /// Fetches one listing page (decoded as gb2312), extracts the detail-page URLs and passes each to
 /// <c>GetHouseByUrl</c>. Also reports the link to the next listing page.
 /// Any exception is logged and not rethrown.
 /// </summary>
 /// <param name="hostUrl">Host prefix prepended to detail links that do not contain "http://".</param>
 /// <param name="pageListUrl">URL of the listing page to fetch.</param>
 /// <param name="rate">Delay in milliseconds before each detail URL is processed; skipped when not positive.</param>
 /// <param name="pageCheckRate">Delay in milliseconds before the page is fetched; skipped when not positive.</param>
 /// <param name="一页链接">Receives the next-page link exactly as matched, or "" when none was found.</param>
 public void SpiderHouseByPageListUrl(string hostUrl, string pageListUrl, int rate, int pageCheckRate, out string 一页链接)
 {
     if (pageCheckRate > 0)
     {
         System.Threading.Thread.Sleep(pageCheckRate);
     }
     一页链接 = "";
     try
     {
         Dictionary<string, RegexInfo> patterns = new Dictionary<string, RegexInfo>
         {
             { "*regex_infUrl", regex_infUrl },
             { "*regex_nextPage", regex_nextPage }
         };

         // Download the page and apply both patterns in one pass.
         Dictionary<string, List<string>> matches = SpiderHelp.GetHtmlByRegex(pageListUrl, "gb2312", patterns, WebObj, CityId);
         List<string> detailUrls = matches["*regex_infUrl"];
         List<string> nextLinks  = matches["*regex_nextPage"];

         if (nextLinks.Count >= 1)
         {
             一页链接 = nextLinks[0];
         }
         else
         {
             一页链接 = "";
         }

         foreach (string detailUrl in detailUrls)
         {
             if (rate > 0)
             {
                 System.Threading.Thread.Sleep(rate);
             }

             // Relative link: prepend the host.
             // Sample detail URL: http://bj.esf.focus.cn/view/22259695.html
             string absoluteUrl = detailUrl.ToLower().Contains("http://") ? detailUrl : hostUrl + detailUrl;
             GetHouseByUrl(absoluteUrl);
             if (isNowPageStop)
             {
                 break;
             }
         }
     }
     catch (Exception ex)
     {
         log.Error(string.Format("SpiderHouse()异常,hostUrl:{0}, pageListUrl:{1}, cityName{2}", hostUrl, pageListUrl, CityName), ex);
     }
 }
Exemplo n.º 12
0
        /// <summary>
        /// Fills the placeholders of the scheduled-task template (<c>任务计划txt</c>), parses the result
        /// as XML and saves it under the configuration directory.
        /// </summary>
        /// <param name="fileName">File name appended to <c>SpiderHelp.GetConfigDire()</c> to form the output path.</param>
        /// <param name="开始运行时间">Start of the schedule; its date and time parts fill separate placeholders.</param>
        /// <param name="间隔天数">Interval in days between runs.</param>
        /// <param name="运行超时时间_天">Run timeout in days; written with a trailing "D".</param>
        /// <param name="运行内容">Text substituted for the run-content placeholder. It is inserted unescaped,
        /// so it must already be valid inside the template's XML.</param>
        /// <exception cref="System.Xml.XmlException">The filled-in template is not well-formed XML.</exception>
        public static void get_配置任务计划文件字符串2(string fileName, DateTime 开始运行时间, int 间隔天数, int 运行超时时间_天, string 运行内容)
        {
            DateTime date = DateTime.Now;
            string   str  = 任务计划txt;

            str = str.Replace("{$当前日期}", date.ToString("yyyy-MM-dd"));
            // Fix: the original replaced "{$当前日期}" a second time, which could never match because the
            // line above had already consumed it. The time-of-day format shows the time placeholder was meant.
            // NOTE(review): assumes the template names it "{$当前时间}"; if it does not, this stays a no-op as before.
            str = str.Replace("{$当前时间}", date.ToString("HH:mm:ss.fffffff"));
            str = str.Replace("{$计算机名}", "fxt00010");
            str = str.Replace("{$计算机用户}", "Administrator");
            str = str.Replace("{$开始运行日期}", 开始运行时间.ToString("yyyy-MM-dd"));
            str = str.Replace("{$开始运行时间}", 开始运行时间.ToString("HH:mm:ss"));
            str = str.Replace("{$运行超时时间}", 运行超时时间_天.ToString() + "D");
            str = str.Replace("{$相隔天数}", 间隔天数.ToString());
            str = str.Replace("{$运行内容}", 运行内容);

            // Round-trip through XmlDocument so malformed output fails here rather than at import time.
            XmlDocument strxml = new XmlDocument();
            strxml.LoadXml(str);
            strxml.Save(SpiderHelp.GetConfigDire() + fileName);
        }
Exemplo n.º 13
0
        /// <summary>
        /// Builds (in memory only) the "exec ..." configuration statements for every 58.com city that
        /// exists in the local city table and has no crawl configuration for this site yet, then calls
        /// <c>导出任务计划配置文件</c>.
        /// </summary>
        public void start()
        {
            const string execTemplate = " exec 往网站爬取配置表添加配置信息 '{0}','{1}','{2}','{3}','{4}',{5},{6}";

            RegexInfo cityBlockRegex = new RegexInfo("<dl id=\"clist\">((?:(?!</dl>).)*)</dl>", "$1");
            RegexInfo cityLinkRegex  = new RegexInfo("(<a href=\"[^\"]+\" onclick=\"co\\([^\"]+\">[^<>]+</a>)", "$1");

            // The same dictionary is reused and grows between the two lookups below.
            Dictionary<string, RegexInfo> sharedPatterns = new Dictionary<string, RegexInfo>();
            sharedPatterns.Add("城市列表Text", cityBlockRegex);
            Dictionary<string, List<string>> blockMatches = SpiderHelp.GetHtmlByRegex("http://www.58.com/ershoufang/changecity/", "utf-8", sharedPatterns, WebsiteManager.GetWebById(WebsiteManager.五八同城_ID), CityId);
            List<string> blockHits = blockMatches["城市列表Text"];
            string cityBlockHtml = "";
            if (blockHits.Count > 0)
            {
                cityBlockHtml = blockMatches["城市列表Text"][0];
            }

            sharedPatterns.Add("城市列表", cityLinkRegex);
            List<string> cityAnchors = SpiderHelp.GetStrByRegex(cityBlockHtml, sharedPatterns)["城市列表"];

            StringBuilder script = new StringBuilder();
            foreach (string anchorHtml in cityAnchors)
            {
                RegexInfo nameRegex = new RegexInfo("<a href=\"[^\"]+\" onclick=\"co\\([^\"]+\">([^<>]+)</a>", "$1");
                RegexInfo urlRegex  = new RegexInfo("<a href=\"([^\"]+)\" onclick=\"co\\([^\"]+\">[^<>]+</a>", "$1");
                Dictionary<string, RegexInfo> anchorPatterns = new Dictionary<string, RegexInfo>
                {
                    { "regexCityName", nameRegex },
                    { "regexCityUrl", urlRegex }
                };
                Dictionary<string, List<string>> anchorParts = SpiderHelp.GetStrByRegex(anchorHtml, anchorPatterns);

                string rawName  = anchorParts["regexCityName"].Count > 0 ? anchorParts["regexCityName"][0] : "";
                string cityName = StringHelp.TrimBlank(rawName);
                string cityUrl  = anchorParts["regexCityUrl"].Count > 0 ? anchorParts["regexCityUrl"][0] : "";

                城市表 city = CityManager.Get城市_byLike城市名称(cityName);
                // Skip unknown cities and cities that already have a configuration for this site.
                if (city == null || SpiderWebConfigManager.get网站爬取配置_by城市ID_网站ID(city.ID, WebsiteManager.五八同城_ID) != null)
                {
                    continue;
                }

                string hostUrl = cityUrl.Replace("/ershoufang", "");
                string listUrl = cityUrl.TrimEnd('/') + "/";
                string statement = string.Format(execTemplate, WebsiteManager.五八同城, city.城市名称, hostUrl, listUrl,
                                                 "", "2000", "2000");
                script.Append(statement).Append("\r\n");
            }

            // Kept as in the original: the generated text is only materialized, not written anywhere.
            string result = script.ToString();

            导出任务计划配置文件();
        }
Exemplo n.º 14
0
        /// <summary>
        /// Reads the file <c>任务计划\计划任务.txt</c> located under <c>SpiderHelp.GetConfigDire()</c>
        /// as UTF-8 text and returns its content as a single string.
        /// </summary>
        /// <remarks>
        /// Lines are appended back to back: the line terminators consumed by <c>ReadLine</c> are not
        /// re-inserted, so the result contains no line breaks.
        /// </remarks>
        /// <returns>The concatenated lines of the file; an empty string when the file is empty.</returns>
        public static string get_任务计划txt()
        {
            StringBuilder sb       = new StringBuilder();
            string        fileName = "任务计划\\计划任务.txt";

            // The using blocks guarantee both handles are released even when reading throws.
            // Access and share modes are kept exactly as before so file-locking behavior does not change.
            using (FileStream fs2 = new FileStream(SpiderHelp.GetConfigDire() + fileName, FileMode.Open, FileAccess.ReadWrite, FileShare.ReadWrite))
            using (StreamReader sr = new StreamReader(fs2, Encoding.UTF8))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    sb.Append(line);
                }
            }
            return sb.ToString();
        }
Exemplo n.º 15
0
        /// <summary>
        /// Cleans one crawled listing, resolves its lookup IDs and inserts it into the case table,
        /// unless validation fails or an identical record already exists.
        /// </summary>
        /// <remarks>
        /// Before doing any work the method loops in 60-second sleeps while
        /// <c>WorkItemManager.CheckPassSpider()</c> returns false.
        /// Company, branch, district, sub-area and decoration rows that cannot be found are inserted on demand.
        /// Exceptions raised inside the try block are logged and the method returns 0.
        /// </remarks>
        /// <param name="网站名称">Website name; stored in the record's 来源 (source) field and used in log messages.</param>
        /// <param name="城市名称">City name; passed to the normalization and validation helpers and used in log messages.</param>
        /// <param name="网站ID">Website ID; stored on the record and passed to the district/sub-area insert helpers and the repetition counter.</param>
        /// <param name="城市ID">City ID; stored on the record and used for district and sub-area lookups.</param>
        /// <param name="楼盘名">Project (estate) name; the record is rejected (returns 0) when it is empty after cleaning.</param>
        /// <param name="案例时间">Case time string; passed to <c>爬取信息数据格式计算整理</c>.</param>
        /// <param name="行政区">District name; resolved to <c>AreaId</c>. Records whose district contains "东莞" or "惠州" are rejected.</param>
        /// <param name="片区">Sub-area name; resolved to <c>SubAreaId</c>.</param>
        /// <param name="楼栋">Building; passed through <c>GetSubstring(100)</c>.</param>
        /// <param name="房号">Room number; passed through <c>GetSubstring(100)</c>.</param>
        /// <param name="用途">Usage; cleared when it contains "其它" or its lookup returns a value below 1, then resolved to 用途ID.</param>
        /// <param name="面积">Area, as text; stored in 面积String after cleaning.</param>
        /// <param name="单价">Unit price, as text; stored in 单价String after cleaning.</param>
        /// <param name="案例类型">Case type name; resolved to 案例类型ID.</param>
        /// <param name="结构">Structure name; resolved to 结构ID.</param>
        /// <param name="建筑类型">Building type name; resolved to 建筑类型ID.</param>
        /// <param name="总价">Total price, as text; stored in 总价String after cleaning.</param>
        /// <param name="所在楼层">Floor, as text; stored in 所在楼层String after cleaning.</param>
        /// <param name="总楼层">Total number of floors, as text; stored in 总楼层String after cleaning.</param>
        /// <param name="户型">Layout name; resolved to 户型ID.</param>
        /// <param name="朝向">Orientation name; resolved to 朝向ID.</param>
        /// <param name="装修">Decoration name (passed through <c>GetSubstring(50)</c>); resolved to 装修ID, inserted when missing.</param>
        /// <param name="建筑年代">Construction year text; passed through <c>GetSubstring(50)</c>.</param>
        /// <param name="信息">Free-text information; passed through <c>RemoveHTMLTags()</c>.</param>
        /// <param name="电话">Phone; passed through <c>GetSubstring(50)</c>.</param>
        /// <param name="URL">Listing URL; stored as-is and used in log messages.</param>
        /// <param name="币种">Currency name; resolved to 币种ID.</param>
        /// <param name="地址">Address; passed through <c>RemoveHeml()</c>.</param>
        /// <param name="创建时间">Creation time stored on the record.</param>
        /// <param name="车位数量">Number of parking spaces, as text; stored in 车位数量String.</param>
        /// <param name="地下室面积">Basement area, as text; stored in 地下室面积String.</param>
        /// <param name="花园面积">Garden area, as text; stored in 花园面积String.</param>
        /// <param name="建筑形式">Building form; passed through <c>GetSubstring(100)</c>.</param>
        /// <param name="配套设施">Supporting facilities; passed through <c>GetSubstring(200)</c>.</param>
        /// <param name="厅结构">Hall structure; passed through <c>GetSubstring(100)</c>.</param>
        /// <param name="中介公司">Agency company name; resolved to <c>CompanyId</c>, inserted when missing.</param>
        /// <param name="门店">Agency branch name; resolved to <c>CompanyAreaId</c>, inserted when missing.</param>
        /// <param name="startSpiderDate">Passed to <c>SpiderRepetitionLogManager.SetSpiderRepetitionCount</c> when a duplicate is found.</param>
        /// <param name="_db">Optional data context handed to the <c>DataClass</c> constructor.</param>
        /// <returns>1: success; 0: failure; -1: failure because the record already exists.</returns>
        public static int 往案例表插入爬取数据(string 网站名称, string 城市名称, int 网站ID, int 城市ID, string 楼盘名, string 案例时间, string 行政区,
                                     string 片区, string 楼栋, string 房号, string 用途, string 面积, string 单价,
                                     string 案例类型, string 结构, string 建筑类型, string 总价, string 所在楼层, string 总楼层,
                                     string 户型, string 朝向, string 装修, string 建筑年代, string 信息, string 电话,
                                     string URL, string 币种, string 地址, DateTime 创建时间, string 车位数量, string 地下室面积,
                                     string 花园面积, string 建筑形式, string 配套设施, string 厅结构, string 中介公司, string 门店, DateTime startSpiderDate, DataClass _db = null)
        {
work:
            // Per the original comment: check whether a database maintenance program is running.
            // While the check fails, sleep one minute and try again.
            if (!WorkItemManager.CheckPassSpider())
            {
                System.Threading.Thread.Sleep(60000);
                goto work;
            }
            DataClass db = new DataClass(_db);

            try
            {
                // Build the record from the raw crawled strings, stripping blanks/markup and truncating
                // the fields that have a length limit.
                案例信息 caseObj = new 案例信息
                {
                    城市ID        = 城市ID,
                    网站ID        = 网站ID,
                    楼盘名         = 楼盘名.TrimBlank().RemoveHeml(),
                    行政区         = 行政区.TrimBlank().RemoveHeml(),
                    片区          = 片区.TrimBlank().RemoveHeml(),
                    楼栋          = 楼栋.GetSubstring(100),
                    房号          = 房号.GetSubstring(100),
                    用途          = 用途,
                    面积String    = 面积.TrimBlank().RemoveHeml(),
                    单价String    = 单价.TrimBlank().RemoveHeml(),
                    案例类型        = 案例类型.TrimBlank().RemoveHeml(),
                    结构          = 结构.TrimBlank().RemoveHeml(),
                    建筑类型        = 建筑类型.TrimBlank().RemoveHeml(),
                    总价String    = 总价.TrimBlank().RemoveHeml(),
                    所在楼层String  = 所在楼层.TrimBlank().RemoveHeml(),
                    总楼层String   = 总楼层.TrimBlank().RemoveHeml(),
                    户型          = 户型.TrimBlank().RemoveHeml(),
                    朝向          = 朝向.TrimBlank().RemoveHeml(),
                    装修          = 装修.TrimBlank().RemoveHeml().GetSubstring(50),
                    建筑年代        = 建筑年代.TrimBlank().RemoveHeml().GetSubstring(50),
                    信息          = 信息.RemoveHTMLTags(),
                    电话          = 电话.TrimBlank().RemoveHeml().GetSubstring(50),
                    URL         = URL,
                    币种          = 币种,
                    地址          = 地址.RemoveHeml(),
                    创建时间        = 创建时间,
                    建筑形式        = 建筑形式.TrimBlank().RemoveHeml().GetSubstring(100),
                    花园面积String  = 花园面积.TrimBlank().RemoveHeml(),
                    厅结构         = 厅结构.TrimBlank().RemoveHeml().GetSubstring(100),
                    车位数量String  = 车位数量.TrimBlank().RemoveHeml(),
                    配套设施        = 配套设施.RemoveHeml().GetSubstring(200),
                    地下室面积String = 地下室面积.TrimBlank().RemoveHeml(),
                    来源          = 网站名称
                };
                // Validate that the supplied usage exists: it is blanked when it contains "其它"
                // or when the lookup by name returns a value below 1.
                if (!string.IsNullOrEmpty(StringHelp.TrimBlank(caseObj.用途).ToRemoveSpe()))
                {
                    if (caseObj.用途.Contains("其它") || 用途Manager.Get用途_根据名称(StringHelp.TrimBlank(caseObj.用途).ToRemoveSpe()) < 1)
                    {
                        caseObj.用途 = "";
                    }
                }
                // Compute/normalize the data formats.
                caseObj = 爬取信息数据格式计算整理(caseObj, 案例时间, 城市名称);
                // Validate the data strings. Records whose district contains "东莞" or "惠州" are rejected outright.
                if (caseObj.行政区.Contains("东莞") || caseObj.行政区.Contains("惠州"))
                {
                    db.Connection_Close();
                    db.Dispose();
                    return(0);
                }
                string checkMessage = "";
                if (!SpiderHelp.CheckHouseAll(caseObj, 城市名称, out checkMessage))
                {
                    db.Connection_Close();
                    db.Dispose();
                    log.Debug(checkMessage);
                    return(0);
                }
                // Database insert preparation: resolve each name to its ID, then null the name field on the record.
                int?案例类型ID = 案例类型Manager.Get案例类型_根据名称(caseObj.案例类型); caseObj.案例类型 = null;
                int?币种ID   = 币种Manager.Get币种_根据名称(caseObj.币种); caseObj.币种 = null;
                int?朝向ID   = 朝向Manager.Get朝向_根据名称(caseObj.朝向); caseObj.朝向 = null;
                int?户型ID   = 户型Manager.Get户型_根据名称(caseObj.户型); caseObj.户型 = null;
                int?建筑类型ID = 建筑类型Manager.Get建筑类型_根据名称(caseObj.建筑类型); caseObj.建筑类型 = null;
                int?结构ID   = 结构Manager.Get结构_根据名称(caseObj.结构); caseObj.结构 = null;
                int?用途ID   = 用途Manager.Get用途_根据名称(caseObj.用途); caseObj.用途 = null;
                //int? 装修ID = 装修Manager.Get装修_根据名称(caseObj.装修); caseObj.装修 = null;// not restricted for now
                // Convert.ToInt32 maps a null nullable to 0, so an unresolved name ends up as ID 0.
                caseObj.案例类型ID = Convert.ToInt32(案例类型ID); // == 0 ? null : 案例类型ID;
                caseObj.币种ID   = Convert.ToInt32(币种ID);   //== 0 ? null : 币种ID;
                caseObj.朝向ID   = Convert.ToInt32(朝向ID);   // == 0 ? null : 朝向ID;
                caseObj.户型ID   = Convert.ToInt32(户型ID);   // == 0 ? null : 户型ID;
                caseObj.建筑类型ID = Convert.ToInt32(建筑类型ID); // == 0 ? null : 建筑类型ID;
                caseObj.结构ID   = Convert.ToInt32(结构ID);   // == 0 ? null : 结构ID;
                caseObj.用途ID   = Convert.ToInt32(用途ID);   // == 0 ? null : 用途ID;
                //caseObj.装修ID = Convert.ToInt32(装修ID) == 0 ? null : 装修ID;// not restricted for now
                中介公司 = 中介公司.TrimBlank();
                门店   = 门店.TrimBlank();
                // Resolve the agency company (inserted when it does not exist yet).
                //中介公司 = 中介公司 + "test";
                caseObj.CompanyId = 0;
                if (!string.IsNullOrEmpty(中介公司))
                {
                    SysData_Company com = CompanyManager.GetByCompanyName(中介公司, _db: db);
                    if (com == null)
                    {
                        com = CompanyManager.Insert(中介公司, _db: db);
                    }
                    caseObj.CompanyId = com.ID;
                }
                // Resolve the agency branch (inserted when it does not exist yet).
                //门店 = 门店 + "test";
                caseObj.CompanyAreaId = 0;
                if (!string.IsNullOrEmpty(门店))
                {
                    SysData_CompanyArea comArea = CompanyAreaManager.GetByCompanyAreaName(门店, _db: db);
                    if (comArea == null)
                    {
                        comArea = CompanyAreaManager.Insert(门店, _db: db);
                    }
                    caseObj.CompanyAreaId = comArea.ID;
                }
                // Project ID: currently always 0; a record without a project name is rejected.
                //caseObj.楼盘名 = caseObj.楼盘名 + "test";
                caseObj.ProjectId = 0;
                if (string.IsNullOrEmpty(caseObj.楼盘名))
                {
                    log.Debug(string.Format("数据保存结束:网站:{0}--城市:{1}-(url:{2}--),(楼盘名为null)", 网站名称, 城市名称, caseObj.URL));
                    db.Connection_Close();
                    db.Dispose();
                    return(0);
                }
                // Disabled code below: earlier variant that looked up / inserted the project and stored its ID.
                //if (!string.IsNullOrEmpty(caseObj.楼盘名))
                //{
                //    SysData_Project project = ProjectManager.GetProjectByProjectNameAndCityId(caseObj.楼盘名, 城市ID, _db: db);
                //    if (project == null)
                //    {
                //        project = ProjectManager.InsertProject(caseObj.楼盘名, 城市ID, 网站ID, _db: db);
                //    }
                //    caseObj.ProjectId = project.ID;
                //}
                //else
                //{
                //    log.Debug(string.Format("数据保存结束:网站:{0}--城市:{1}-(url:{2}--),(楼盘名为null)", 网站名称, 城市名称, caseObj.URL));
                //    db.Connection_Close();
                //    db.Dispose();
                //    return 0;
                //}
                //caseObj.楼盘名 = null;

                // Resolve the district ID (district inserted when it does not exist yet).
                caseObj.AreaId = 0;
                if (!string.IsNullOrEmpty(caseObj.行政区))
                {
                    SysData_Area areaObj = AreaManager.GetAreaByAreaNameLikeByCityId(caseObj.行政区, 城市ID, _db: db);
                    if (areaObj == null)
                    {
                        areaObj = AreaManager.InsertArea(caseObj.行政区, 城市ID, 网站ID, _db: db);
                    }
                    caseObj.AreaId = areaObj.ID;
                }
                else
                {
                    caseObj.AreaId = 0;
                }
                caseObj.行政区 = null;
                // Resolve the sub-area ID (sub-area inserted when it does not exist yet).
                //caseObj.片区 = caseObj.片区 + "test";
                caseObj.SubAreaId = 0;
                if (!string.IsNullOrEmpty(caseObj.片区))
                {
                    SysData_SubArea subAreaObj = SubAreaManager.GetAreaByAreaNameByCityId(caseObj.片区, 城市ID, _db: db);
                    if (subAreaObj == null)
                    {
                        subAreaObj = SubAreaManager.InsertArea(caseObj.片区, 城市ID, 网站ID, _db: db);
                    }
                    caseObj.SubAreaId = subAreaObj.ID;
                }
                caseObj.片区 = null;
                // Resolve the decoration ID (decoration inserted when it does not exist yet).
                //caseObj.装修 = caseObj.装修 + "test";
                caseObj.装修ID = 0;
                if (!string.IsNullOrEmpty(caseObj.装修))
                {
                    SysData_装修 zhuangxiuObj = 装修Manager.Get装修_根据名称(caseObj.装修, db);
                    if (zhuangxiuObj == null)
                    {
                        zhuangxiuObj = 装修Manager.Insert(caseObj.装修, db);
                    }
                    caseObj.装修ID = zhuangxiuObj.ID;
                }
                caseObj.装修 = null;
                // Check whether an identical record already exists.
                案例信息 obj = CaseManager.GetCaseIdentical(caseObj, _db: db);
                if (obj != null)
                {
                    // Record the repetition count for this website/city/crawl start date.
                    SpiderRepetitionLogManager.SetSpiderRepetitionCount(网站ID, 城市ID, startSpiderDate, db);
                    log.Debug(string.Format("{0}-数据保存结束:网站:{1}--城市:{2}-(url:{3}--),(数据已存在)", DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), 网站名称, 城市名称, caseObj.URL));
                    db.Connection_Close();
                    db.Dispose();
                    return(-1);
                }
                Insert(caseObj, db);
                //db.DB.案例信息.InsertOnSubmit(caseObj);
                //db.DB.SubmitChanges();

                db.Connection_Close();
                db.Dispose();
            }
            catch (Exception ex)
            {
                // Any failure is logged with website and city, the connection is released, and 0 is returned.
                log.Error(string.Format("{0}-系统异常:网站:{1}--城市:{2}", DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), 网站名称, 城市名称), ex);
                db.Connection_Close();
                db.Dispose();
                return(0);
            }
            return(1);
        }
Exemplo n.º 16
0
        /// <summary>
        /// Downloads one listing detail page, extracts its fields with the configured regular expressions,
        /// builds a <c>NewHouse</c> record from them and hands it to <c>SaveNowData</c>.
        /// </summary>
        /// <remarks>Any exception is logged together with the URL and city name and is not rethrown.</remarks>
        /// <param name="url">URL of the detail page.</param>
        public void GetHouseByUrl(string url)
        {
            try
            {
                // Extraction rule for every field, keyed by rule name (some keys carry a leading '*').
                Dictionary<string, RegexInfo> fieldRules = new Dictionary<string, RegexInfo>
                {
                    { "*regex_lpm", regex_lpm },
                    { "*regex_xzq", regex_xzq },
                    { "regex_pq", regex_pq },
                    { "regex_hx", regex_hx },
                    { "*regex_mj", regex_mj },
                    { "*regex_dj", regex_dj },
                    { "*regex_zj", regex_zj },
                    { "regex_jznd", regex_jznd },
                    { "regex_cx", regex_cx },
                    { "regex_szlc", regex_szlc },
                    { "regex_zlc", regex_zlc },
                    { "regex_jg", regex_jg },
                    { "regex_yt", regex_yt },
                    { "regex_zx", regex_zx },
                    { "regex_title", regex_title },
                    { "regex_phone", regex_phone },
                    { "*regex_address", regex_address },
                    { "regex_datetime", regex_datetime },
                    { "regex_comName", regex_comName },
                    { "regex_comArea", regex_comArea }
                };

                // Fetch the page and apply all rules; each field takes its first match or "" when there is none.
                Dictionary<string, List<string>> found = SpiderHelp.GetHtmlByRegex(url, "utf-8", fieldRules, WebObj, CityId, keepAlive: true);

                List<string> hits = found["regex_title"];
                string title = hits.Count > 0 ? hits[0].Replace("&nbsp;", "") : "";
                hits = found["*regex_lpm"];
                string projectName = hits.Count > 0 ? hits[0].TrimBlank() : "";
                hits = found["*regex_xzq"];
                string district = hits.Count > 0 ? hits[0].TrimBlank() : "";
                hits = found["regex_pq"];
                string subArea = hits.Count > 0 ? hits[0].TrimBlank() : "";
                hits = found["regex_hx"];
                string layout = hits.Count > 0 ? hits[0].TrimBlank() : "";
                hits = found["*regex_mj"];
                string area = hits.Count > 0 ? hits[0].TrimBlank() : "";
                hits = found["*regex_dj"];
                string unitPrice = hits.Count > 0 ? hits[0].TrimBlank() : "";
                hits = found["*regex_zj"];
                string totalPrice = hits.Count > 0 ? hits[0].TrimBlank() : "";
                hits = found["regex_jznd"];
                string buildYear = hits.Count > 0 ? hits[0].TrimBlank().Replace("&nbsp;", "") : "";
                hits = found["regex_cx"];
                string orientation = hits.Count > 0 ? hits[0].TrimBlank().Replace("&nbsp;", "") : "";
                hits = found["regex_szlc"];
                string floor = hits.Count > 0 ? hits[0].TrimBlank() : "";
                hits = found["regex_zlc"];
                string totalFloors = hits.Count > 0 ? hits[0].TrimBlank() : "";
                hits = found["regex_jg"];
                string structure = hits.Count > 0 ? hits[0].TrimBlank() : "";
                hits = found["regex_yt"];
                string usage = hits.Count > 0 ? hits[0].TrimBlank() : "";
                hits = found["regex_zx"];
                string decoration = hits.Count > 0 ? hits[0].TrimBlank().Replace("&nbsp;", "") : "";
                hits = found["regex_phone"];
                string phone = hits.Count > 0 ? hits[0].TrimBlank() : "";
                hits = found["*regex_address"];
                string address = hits.Count > 0 ? hits[0].TrimBlank() : "";
                hits = found["regex_datetime"];
                string publishTime = hits.Count > 0 ? hits[0].Trim() : "";
                hits = found["regex_comName"];
                string companyName = hits.Count > 0 ? hits[0] : "";
                hits = found["regex_comArea"];
                string branchName = hits.Count > 0 ? hits[0] : "";

                // A value containing "无" (none) is treated as not provided.
                if (decoration.Contains("无"))
                {
                    decoration = "";
                }
                if (orientation.Contains("无"))
                {
                    orientation = "";
                }
                unitPrice = unitPrice.Replace(",", "");

                // When both floor values are integers and the floor exceeds the total, the two were swapped
                // on the page: swap them back.
                string floorTrimmed = floor.TrimBlank();
                string totalTrimmed = totalFloors.TrimBlank();
                if (StringHelp.IsInteger(totalTrimmed)
                    && StringHelp.IsInteger(floorTrimmed)
                    && Convert.ToInt32(floorTrimmed) > Convert.ToInt32(totalTrimmed))
                {
                    floor       = totalTrimmed;
                    totalFloors = floorTrimmed;
                }

                // Assemble the record (positional constructor; unused positions are passed as "").
                NewHouse newHouse = new NewHouse(
                    projectName, GetCaseDate(publishTime), district, subArea, "", "", usage, area, unitPrice,
                    "", structure, "", totalPrice, floor, totalFloors, layout, orientation, decoration, buildYear,
                    title, phone, url, "", 网站名称, address, "", "", "", "", "", "", companyName, branchName);

                // Normalize the case date; fall back to "yesterday" when it is not a valid date.
                newHouse.Alsj = newHouse.Alsj == null ? newHouse.Alsj : newHouse.Alsj.Trim();
                if (!newHouse.Alsj.CheckStrIsDate())
                {
                    newHouse.Alsj = DateTime.Now.AddDays(-1).ToString();
                }

                // Crawls started before noon accept data from yesterday onwards; later crawls only today's data.
                int startHour = Convert.ToInt32(nowDate.ToString("HH"));
                DateTime caseDate   = Convert.ToDateTime(newHouse.Alsj);
                DateTime crawlDay   = Convert.ToDateTime(nowDate.ToString("yyyy-MM-dd"));
                DateTime oldestKept = startHour < 12 ? crawlDay.AddDays(-1) : crawlDay;
                if (caseDate < oldestKept)
                {
                    isNowPageStop = true;
                }
                // NOTE(review): the flag is unconditionally reset right after the date check, so the check above
                // has no lasting effect. Confirm whether this is intentional.
                isNowPageStop = false;

                // Persist the record.
                SaveNowData(newHouse);
                log.Debug(string.Format("{0}-数据保存完成url:{1}--cityname:{2}--value_title:{3}--value_lpm{4}", DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), url, CityName, title, projectName));
            }
            catch (Exception ex)
            {
                log.Error(string.Format("GetHouseByUrl()异常,url:{0}, cityName:{1}", url, CityName), ex);
            }
        }
Exemplo n.º 17
0
        /// <summary>
        /// Crawls all list pages reachable from one root list URL: reads the total page count, then processes
        /// page after page until no next link remains or <c>isNowPageStop</c> is set.
        /// </summary>
        /// <remarks>Exceptions are logged and not rethrown; a final debug line is written in every case.</remarks>
        /// <param name="hostUrl">Prefix prepended to a next-page link that does not contain "http://".</param>
        /// <param name="pageListIndexUrl">URL of the first list page; "list.html" in it is replaced to build the URLs of later pages.</param>
        /// <param name="rate">Passed through to <c>SpiderHouseByPageListUrl</c>.</param>
        /// <param name="pageCheckRate">When positive, milliseconds to sleep before starting; also passed through to <c>SpiderHouseByPageListUrl</c>.</param>
        public void SpiderHouse(string hostUrl, string pageListIndexUrl, int rate, int pageCheckRate)
        {
            if (pageCheckRate > 0)
            {
                System.Threading.Thread.Sleep(pageCheckRate);
            }

            try
            {
                // Read the total page count from the first list page (0 when the rule finds nothing).
                Dictionary<string, RegexInfo> pageCountRules = new Dictionary<string, RegexInfo> { { "*总页数", 总页数正则 } };
                Dictionary<string, List<string>> pageCountResult = SpiderHelp.GetHtmlByRegex(pageListIndexUrl, "utf-8", pageCountRules, WebObj, CityId, keepAlive: true);
                List<string> pageCountHits = pageCountResult["*总页数"];
                int totalPages = 0;
                if (pageCountHits.Count >= 1)
                {
                    totalPages = Convert.ToInt32(pageCountHits[0]);
                }

                string pageUrlTemplate = pageListIndexUrl.Replace("list.html", "list-page{0}.html");
                string nextLink        = pageListIndexUrl;

                log.Debug(string.Format("置家网SpiderHouse()--获取总页数{0},hostUrl:{1}, pageListUrl:{2}, cityName{3}", totalPages, hostUrl, pageListIndexUrl, CityName));

                // upcomingPage is the number of the page that follows the one being processed.
                for (int upcomingPage = 2; !string.IsNullOrEmpty(nextLink); upcomingPage++)
                {
                    string currentUrl = nextLink.ToLower().Contains("http://") ? nextLink : hostUrl + nextLink;
                    log.Debug(string.Format("置家网SpiderHouse()--获取根页面下一页链接{0},hostUrl:{1}, pageListUrl:{2}, cityName{3}", currentUrl, hostUrl, pageListIndexUrl, CityName));

                    SpiderHouseByPageListUrl(hostUrl, currentUrl, rate, pageCheckRate, out nextLink);

                    // No next link although pages remain (per the original comment: blocked IP or network
                    // error): synthesize the URL of the following page instead of stopping.
                    bool linkMissing = string.IsNullOrEmpty(nextLink);
                    if (linkMissing && upcomingPage <= totalPages)
                    {
                        nextLink = string.Format(pageUrlTemplate, upcomingPage);
                    }

                    if (isNowPageStop)
                    {
                        break;
                    }
                }

                log.Debug(string.Format("置家网SpiderHouse()--获取根页面下信息吸取完成,{0},hostUrl:{1}, pageListUrl:{2}, cityName{3}", nextLink, hostUrl, pageListIndexUrl, CityName));
            }
            catch (Exception ex)
            {
                log.Error(string.Format("置家网SpiderHouse()异常,hostUrl:{0}, pageListIndexUrl:{1}, cityName{2}", hostUrl, pageListIndexUrl, CityName), ex);
            }

            log.Debug(string.Format("置家网SpiderHouse()--获取{0}页面下详细信息Url吸取完成,详细页面url内容正则析取中-,hostUrl:{1}, pageListUrl:{2}", CityName, hostUrl, pageListIndexUrl));
        }
Exemplo n.º 18
0
        /// <summary>
        /// Collects proxy IPs: reads IP-list page URLs from a local config file, adds the list URLs found on
        /// each page of the httpip.net forum index, downloads every list page and imports each IP found
        /// through <c>ProxyIpHelp.ImportProxyIp</c>.
        /// </summary>
        /// <remarks>
        /// Config lines have the form <c>url</c> or <c>url$host</c>; <c>host</c> is prepended to a URL that does
        /// not contain "http://" and defaults to "http://www.httpip.net/".
        /// Only the first page of each IP list is fetched; no per-list paging is implemented.
        /// Reading the config file happens outside the try block, so a missing file propagates to the caller;
        /// exceptions during crawling are logged and end the run.
        /// </remarks>
        public void Run()
        {
            // One IP entry on a list page: text containing '@' up to a closing </font>.
            RegexInfo regex_ipInfo = new RegexInfo("([^<>]+\\@[^<>]+</font>)", "$1");
            // The IP: the part of an entry before '@'.
            RegexInfo regex_ip = new RegexInfo("([^<>]+)\\@[^<>]+</font>", "$1");
            // The IP's area: the part of an entry after '#'.
            RegexInfo regex_area = new RegexInfo("[^<>]+\\#([^<>]+)</font>", "$1");
            Dictionary<string, RegexInfo> regexDic1 = new Dictionary<string, RegexInfo>();
            regexDic1.Add("regex_ipInfo", regex_ipInfo);
            Dictionary<string, RegexInfo> regexDic2 = new Dictionary<string, RegexInfo>();
            regexDic2.Add("regex_ip", regex_ip);
            regexDic2.Add("regex_area", regex_area);

            // Seed the work list with the IP-list page URLs from the text config file.
            // The using block releases the file handle even when reading throws.
            List<string> urlList    = new List<string>();
            string       configPath = AppDomain.CurrentDomain.BaseDirectory + "IP代理/Config/Lastro网IP代理ProxyIpConfig.txt";
            using (StreamReader sr = new StreamReader(configPath))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    urlList.Add(line);
                }
            }

            // Rules for the forum index: links to IP-list threads and the total number of index pages.
            RegexInfo regex_urllist           = new RegexInfo("<a href=\"([^\"]+)\"[^<>]*>【国内代理】[^<>]+</a>", "$1");
            RegexInfo regex_urllist_pagecount = new RegexInfo("<span title=\"共[^\"]+页\"> /([^<>]+)页", "$1");
            Dictionary<string, RegexInfo> regexDic3 = new Dictionary<string, RegexInfo>();
            regexDic3.Add("regex_urllist", regex_urllist);
            regexDic3.Add("regex_urllist_pagecount", regex_urllist_pagecount);

            int    urllist_index         = 1;
            int    urllist_max_index     = 1;
            string urllist_nextPage_para = "http://www.httpip.net/forum-36-{0}.html";
            string urllist_nextPage      = string.Format(urllist_nextPage_para, urllist_index);

            try
            {
                // One iteration per forum index page; the first one also processes the config-file URLs.
                do
                {
                    Dictionary<string, List<string>> dicValueList3 = SpiderHelp.GetHtmlByRegexNotProxyIp(urllist_nextPage, "gb2312", regexDic3);
                    urlList.AddRange(dicValueList3["regex_urllist"]);

                    // Decide whether another index page follows ("" ends the outer loop).
                    List<string> pageCountHits = dicValueList3["regex_urllist_pagecount"];
                    urllist_max_index = pageCountHits.Count < 1 ? 0 : Convert.ToInt32(pageCountHits[0].TrimBlank());
                    if (urllist_index < urllist_max_index)
                    {
                        urllist_index    = urllist_index + 1;
                        urllist_nextPage = string.Format(urllist_nextPage_para, urllist_index);
                    }
                    else
                    {
                        urllist_nextPage = "";
                    }

                    // Crawl every IP-list page collected so far.
                    SysData_ProxyIp existsObj = new SysData_ProxyIp();
                    string          message   = "";
                    foreach (string urlInfo in urlList)
                    {
                        string[] parts   = urlInfo.Split('$');
                        string   url     = parts[0];
                        string   urlHost = parts.Length < 2 ? "http://www.httpip.net/" : parts[1];
                        if (!url.ToLower().Contains("http://"))
                        {
                            url = urlHost + url;
                        }

                        Dictionary<string, List<string>> dicValueList = SpiderHelp.GetHtmlByRegexNotProxyIp(url, "gb2312", regexDic1);
                        List<string> ipInfoList = dicValueList["regex_ipInfo"];
                        if (ipInfoList == null || ipInfoList.Count < 1)
                        {
                            log.Debug(string.Format("未获取到IP列表,url:{0}", url));
                            continue;
                        }

                        foreach (string ipInfo in ipInfoList)
                        {
                            Dictionary<string, List<string>> infoListDic = SpiderHelp.GetStrByRegex(ipInfo, regexDic2);
                            string ip     = infoListDic["regex_ip"].Count < 1 ? "" : infoListDic["regex_ip"][0];
                            string ipArea = infoListDic["regex_area"].Count < 1 ? "" : infoListDic["regex_area"][0];
                            ip = ip.TrimBlank();

                            // A result other than 1 means the import did not succeed; message carries the reason.
                            int result = ProxyIpHelp.ImportProxyIp(ip, ipArea, out existsObj, out message);
                            if (result != 1)
                            {
                                log.Debug(string.Format("{0},url:{1},ip:{2}", message, url, ip == null ? "null" : ip));
                            }
                            else
                            {
                                log.Debug(string.Format("ip插入成功,url:{0},ip:{1}", url, ip == null ? "null" : ip));
                            }
                        }
                    }

                    // Start the next index page with an empty work list so no list page is crawled twice.
                    urlList = new List<string>();
                }
                while (!string.IsNullOrEmpty(urllist_nextPage));
            }
            catch (Exception ex)
            {
                log.Error("系统异常", ex);
            }
        }
Exemplo n.º 19
0
        /// <summary>
        /// Downloads a listing detail page, extracts its fields with the configured
        /// regex rules, and saves the result as a <c>NewHouse</c> record.
        /// </summary>
        /// <remarks>
        /// Every exception is caught and logged, so a page that cannot be fetched, or
        /// whose case date cannot be converted to a date, is dropped without being saved.
        /// </remarks>
        /// <param name="url">URL of the detail page.</param>
        public void GetHouseByUrl(string url)
        {
            try
            {
                // Extraction rules, registered in the order they are handed to SpiderHelp.
                // NOTE(review): the '*' key prefix presumably marks required fields - confirm in SpiderHelp.
                Dictionary<string, RegexInfo> rules = new Dictionary<string, RegexInfo>
                {
                    { "*regex_lpm", regex_lpm },
                    { "*regex_xzq", regex_xzq },
                    { "regex_pq", regex_pq },
                    { "regex_hx", regex_hx },
                    { "*regex_mj", regex_mj },
                    { "*regex_dj", regex_dj },
                    { "*regex_zj", regex_zj },
                    { "regex_jznd", regex_jznd },
                    { "regex_cx", regex_cx },
                    { "regex_szlc", regex_szlc },
                    { "regex_zlc", regex_zlc },
                    { "regex_jg", regex_jg },
                    { "regex_zx", regex_zx },
                    { "regex_yt", regex_yt },
                    { "regex_jzxs", regex_jzxs },
                    { "regex_ptss", regex_ptss },
                    { "regex_title", regex_title },
                    { "regex_phone", regex_phone },
                    { "regex_address", regex_address },
                    { "regex_datetime", regex_datetime },
                    { "regex_hymj", regex_hymj },
                    { "regex_tjg", regex_tjg },
                    { "regex_cwsl", regex_cwsl },
                    { "regex_dxsmj", regex_dxsmj },
                    { "regex_comName", regex_comName },
                    { "regex_comArea", regex_comArea }
                };

                // Fetch the page and apply every rule to it.
                Dictionary<string, List<string>> matches = SpiderHelp.GetHtmlByRegex(url, "utf-8", rules, WebObj, CityId);

                // First captured value for a rule, or "" when the rule produced no match.
                Func<string, string> first = key =>
                {
                    List<string> found = matches[key];
                    return found.Count < 1 ? "" : found[0];
                };

                string title    = first("regex_title");
                string lpm      = first("*regex_lpm");
                string xzq      = first("*regex_xzq");
                string pq       = first("regex_pq");
                string hx       = first("regex_hx");
                string mj       = first("*regex_mj");
                string dj       = first("*regex_dj");
                string zj       = first("*regex_zj");
                string jznd     = first("regex_jznd");
                string cx       = first("regex_cx");
                string szlc     = first("regex_szlc");
                string zlc      = first("regex_zlc");
                string jg       = first("regex_jg");
                string zx       = first("regex_zx");
                string yt       = first("regex_yt"); // extracted but not passed to NewHouse below
                string jzxs     = first("regex_jzxs");
                string ptss     = first("regex_ptss");
                string phone    = first("regex_phone");
                string address  = first("regex_address");
                string datetime = first("regex_datetime");
                string hymj     = first("regex_hymj");
                string tjg      = first("regex_tjg");
                string cwsl     = first("regex_cwsl");
                string dxsmj    = first("regex_dxsmj");
                string comName  = first("regex_comName");
                string comArea  = first("regex_comArea");

                // A company name containing "独立" is blanked out.
                if (comName.Contains("独立"))
                {
                    comName = "";
                }
                jznd     = 转换建筑年代(jznd);
                datetime = 转换案例时间(datetime);

                // Assemble the record from the extracted values.
                NewHouse newHouse = new NewHouse(
                    lpm, datetime, xzq, pq, "", "", "", mj, dj,
                    "", jg, "", zj, szlc, zlc, hx, cx, zx, jznd,
                    title, phone, url, "", 网站名称, address, jzxs, hymj, tjg, cwsl, ptss, dxsmj, comName, comArea);

                // Strip surrounding whitespace from the stored case date (null is left as null).
                string alsj = newHouse.Alsj;
                newHouse.Alsj = alsj != null ? alsj.Trim() : alsj;

                // Hour of day taken from nowDate decides the cut-off: before 12:00 the cut-off is
                // yesterday's date, otherwise today's date.
                bool beforeNoon = Convert.ToInt32(nowDate.ToString("HH")) < 12;
                DateTime caseDate = Convert.ToDateTime(newHouse.Alsj);
                DateTime cutoff   = Convert.ToDateTime(nowDate.ToString("yyyy-MM-dd"));
                if (beforeNoon)
                {
                    cutoff = cutoff.AddDays(-1);
                }
                // A record older than the cut-off raises the isNowPageStop flag.
                if (caseDate < cutoff)
                {
                    isNowPageStop = true;
                }

                // Persist the record and trace what was saved.
                SaveNowData(newHouse);
                string savedMessage = string.Format(
                    "{0}-数据保存完成url:{1}--cityname:{2}--value_title:{3}--value_lpm{4}",
                    DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), url, CityName, title, lpm);
                log.Debug(savedMessage);
            }
            catch (Exception ex)
            {
                string failure = string.Format("GetHouseByUrl()异常,url:{0}, cityName:{1}", url, CityName);
                log.Error(failure, ex);
            }
        }
Exemplo n.º 20
0
        /// <summary>
        /// Downloads a listing detail page (decoded as gb2312), extracts its fields with
        /// the configured regex rules, and saves the result as a <c>NewHouse</c> record.
        /// Listings for which <c>CheckPurpose</c> returns "0" are logged and skipped.
        /// </summary>
        /// <remarks>
        /// Every exception is caught and logged; the record is not saved in that case.
        /// </remarks>
        /// <param name="url">URL of the detail page.</param>
        /// <param name="urlPanelHtml">Text that <c>regex_updatetime</c> is applied to in order to obtain the update time.</param>
        public void GetHouseByUrl(string url, string urlPanelHtml)
        {
            try
            {
                // Extraction rules, registered in the order they are handed to SpiderHelp.
                // NOTE(review): the '*' key prefix presumably marks required fields - confirm in SpiderHelp.
                Dictionary<string, RegexInfo> rules = new Dictionary<string, RegexInfo>
                {
                    { "*regex_lpm", regex_lpm },
                    { "*regex_xzq", regex_xzq },
                    { "regex_pq", regex_pq },
                    { "regex_hx", regex_hx },
                    { "*regex_mj", regex_mj },
                    { "*regex_dj", regex_dj },
                    { "*regex_zj", regex_zj },
                    { "regex_jznd", regex_jznd },
                    { "regex_cx", regex_cx },
                    { "regex_szlc", regex_szlc },
                    { "regex_zlc", regex_zlc },
                    { "regex_jg", regex_jg },
                    { "regex_zx", regex_zx },
                    { "regex_title", regex_title },
                    { "regex_phone", regex_phone },
                    { "regex_address", regex_address },
                    { "regex_datetime", regex_datetime },
                    { "regex_comName", regex_comName },
                    { "regex_comArea", regex_comArea },
                    { "regex_yt", regex_yt }
                };

                // Fetch the page (30 s timeout) and apply every rule; the update time comes
                // from the separately supplied panel text.
                Dictionary<string, List<string>> matches = SpiderHelp.GetHtmlByRegex(url, "gb2312", rules, WebObj, CityId, timeout: 30000);
                List<string> updateTimes = SpiderHelp.GetStrByRegexByIndex(urlPanelHtml, regex_updatetime);

                // First captured value for a rule, or "" when the rule produced no match.
                Func<string, string> first = key =>
                {
                    List<string> found = matches[key];
                    return found.Count < 1 ? "" : found[0];
                };

                string title      = first("regex_title");
                string lpm        = first("*regex_lpm");
                string xzq        = first("*regex_xzq");
                string pq         = first("regex_pq");
                string hx         = first("regex_hx");
                string mj         = first("*regex_mj");
                string dj         = first("*regex_dj");
                string zj         = first("*regex_zj");
                string jznd       = first("regex_jznd");
                string cx         = first("regex_cx");
                string szlc       = first("regex_szlc");
                string zlc        = first("regex_zlc");
                string jg         = first("regex_jg");
                string yt         = first("regex_yt");
                string zx         = first("regex_zx");
                string phone      = first("regex_phone");
                string address    = first("regex_address");
                string datetime   = first("regex_datetime");
                string comName    = first("regex_comName");
                string comArea    = first("regex_comArea");
                string updatetime = updateTimes.Count < 1 ? "" : updateTimes[0];

                // Reject the listing when CheckPurpose reports "0" for its purpose text.
                // The checked purpose is only used for this gate; it is not passed to NewHouse.
                string checkedPurpose = CheckPurpose(yt);
                if (checkedPurpose == "0")
                {
                    string rejected = string.Format("GetHouseByUrl()用途无效,url:{0}, cityName:{1},用途:{2}", url, CityName, Convert.ToString(yt));
                    log.Debug(rejected);
                    return;
                }

                // Drop the "朝" character from the orientation text, then trim blanks.
                cx = cx.Replace("朝", "").TrimBlank();

                // Assemble the record from the extracted values.
                string caseDateText = GetCaseDate(datetime, updatetime);
                NewHouse newHouse = new NewHouse(
                    lpm, caseDateText, xzq, pq, "", "", "", mj, dj,
                    "", jg, "", zj, szlc, zlc, hx, cx, zx, jznd,
                    title, phone, url, "", 网站名称, address, "", "", "", "", "", "", comName, comArea);

                // Strip surrounding whitespace from the stored case date (null is left as null).
                string alsj = newHouse.Alsj;
                newHouse.Alsj = alsj != null ? alsj.Trim() : alsj;

                // Fall back to the current time when the case date is not a valid date string.
                if (!newHouse.Alsj.CheckStrIsDate())
                {
                    newHouse.Alsj = DateTime.Now.ToString();
                }

                // Hour of day taken from nowDate decides the cut-off: before 12:00 the cut-off is
                // yesterday's date, otherwise today's date.
                bool beforeNoon = Convert.ToInt32(nowDate.ToString("HH")) < 12;
                DateTime caseDate = Convert.ToDateTime(newHouse.Alsj);
                DateTime cutoff   = Convert.ToDateTime(nowDate.ToString("yyyy-MM-dd"));
                if (beforeNoon)
                {
                    cutoff = cutoff.AddDays(-1);
                }
                if (caseDate < cutoff)
                {
                    isNowPageStop = true;
                }

                // Type pages are crawled by multiple threads, so stopping is disabled for now:
                // the flag is always reset regardless of the comparison above.
                isNowPageStop = false;

                // Persist the record and trace what was saved.
                SaveNowData(newHouse);
                string savedMessage = string.Format(
                    "{0}-数据保存完成url:{1}--cityname:{2}--value_title:{3}--value_lpm{4}",
                    DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), url, CityName, title, lpm);
                log.Debug(savedMessage);
            }
            catch (Exception ex)
            {
                string failure = string.Format("GetHouseByUrl()异常,url:{0}, cityName:{1}", url, CityName);
                log.Error(failure, ex);
            }
        }
Exemplo n.º 21
0
        /// <summary>
        /// Downloads a listing detail page, extracts its fields with the configured regex
        /// rules, looks up the agency name from the user-data page, and saves the result
        /// as a <c>NewHouse</c> record.
        /// </summary>
        /// <remarks>
        /// Every exception is caught and logged; the record is not saved in that case.
        /// </remarks>
        /// <param name="url">URL of the detail page.</param>
        /// <param name="_infoText">Text of the list-page entry that corresponds to this detail page; the case date is extracted from it.</param>
        public void GetHouseByUrl(string url, string _infoText)
        {
            try
            {
                // Rules applied to the detail page. regex_datetime is intentionally absent:
                // the date is taken from the list-page text instead (see listRules).
                // NOTE(review): the '*' key prefix presumably marks required fields - confirm in SpiderHelp.
                Dictionary<string, RegexInfo> pageRules = new Dictionary<string, RegexInfo>
                {
                    { "*regex_lpm", regex_lpm },
                    { "*regex_xzq", regex_xzq },
                    { "regex_pq", regex_pq },
                    { "regex_hx", regex_hx },
                    { "*regex_mj", regex_mj },
                    { "*regex_dj", regex_dj },
                    { "*regex_zj", regex_zj },
                    { "regex_jznd", regex_jznd },
                    { "regex_cx", regex_cx },
                    { "regex_szlc", regex_szlc },
                    { "regex_zlc", regex_zlc },
                    { "regex_jg", regex_jg },
                    { "regex_zx", regex_zx },
                    { "regex_yt", regex_yt },
                    { "regex_jzxs", regex_jzxs },
                    { "regex_ptss", regex_ptss },
                    { "regex_title", regex_title },
                    { "regex_phone", regex_phone },
                    { "regex_address", regex_address },
                    { "regex_hymj", regex_hymj },
                    { "regex_tjg", regex_tjg },
                    { "regex_cwsl", regex_cwsl },
                    { "regex_dxsmj", regex_dxsmj },
                    { "regex_userId", regex_userId }
                };
                // Rule applied to the list-page text.
                Dictionary<string, RegexInfo> listRules = new Dictionary<string, RegexInfo>
                {
                    { "regex_datetime", regex_datetime }
                };

                // Run the rules: detail page over HTTP, list-page text in memory.
                Dictionary<string, List<string>> pageMatches = SpiderHelp.GetHtmlByRegex(url, "utf-8", pageRules, WebObj, CityId);
                Dictionary<string, List<string>> listMatches = SpiderHelp.GetStrByRegex(_infoText, listRules);

                // First captured value for a rule in the given result set, or "" when nothing matched.
                Func<Dictionary<string, List<string>>, string, string> first = (source, key) =>
                {
                    List<string> found = source[key];
                    return found.Count < 1 ? "" : found[0];
                };

                string title    = first(pageMatches, "regex_title");
                string lpm      = first(pageMatches, "*regex_lpm");
                string xzq      = first(pageMatches, "*regex_xzq");
                string pq       = first(pageMatches, "regex_pq");
                string hx       = first(pageMatches, "regex_hx");
                string mj       = first(pageMatches, "*regex_mj");
                string dj       = first(pageMatches, "*regex_dj");
                string zj       = first(pageMatches, "*regex_zj");
                string jznd     = first(pageMatches, "regex_jznd");
                string cx       = first(pageMatches, "regex_cx");
                string szlc     = first(pageMatches, "regex_szlc");
                string zlc      = first(pageMatches, "regex_zlc");
                string jg       = first(pageMatches, "regex_jg");
                string zx       = first(pageMatches, "regex_zx");
                string yt       = first(pageMatches, "regex_yt"); // extracted but not passed to NewHouse below
                string jzxs     = first(pageMatches, "regex_jzxs");
                string ptss     = first(pageMatches, "regex_ptss");
                string phone    = first(pageMatches, "regex_phone");
                string address  = first(pageMatches, "regex_address");
                string hymj     = first(pageMatches, "regex_hymj");
                string tjg      = first(pageMatches, "regex_tjg");
                string cwsl     = first(pageMatches, "regex_cwsl");
                string dxsmj    = first(pageMatches, "regex_dxsmj");
                string userId   = first(pageMatches, "regex_userId");
                string datetime = first(listMatches, "regex_datetime");

                // Look up the agency (company) name on the poster's user-data page.
                string userInfoUrl = "http://user.58.com/userdata?userid={0}&type=10";
                Dictionary<string, RegexInfo> companyRules = new Dictionary<string, RegexInfo>
                {
                    { "regex_comName", regex_comName }
                };
                string companyPageUrl = string.Format(userInfoUrl, userId);
                Dictionary<string, List<string>> companyMatches = SpiderHelp.GetHtmlByRegex(companyPageUrl, "utf-8", companyRules, WebObj, CityId);
                string comName = first(companyMatches, "regex_comName");

                // Only the case date is converted here; the building-year text is passed through as extracted.
                datetime = 转换案例时间(datetime);

                // Assemble the record from the extracted values.
                NewHouse newHouse = new NewHouse(
                    lpm, datetime, xzq, pq, "", "", "", mj, dj,
                    "", jg, "", zj, szlc, zlc, hx, cx, zx, jznd,
                    title, phone, url, "", 网站名称, address, jzxs, hymj, tjg, cwsl, ptss, dxsmj, comName, "");

                // Strip surrounding whitespace from the stored case date (null is left as null).
                string alsj = newHouse.Alsj;
                newHouse.Alsj = alsj != null ? alsj.Trim() : alsj;

                // Hour of day taken from nowDate decides the cut-off: before 12:00 the cut-off is
                // yesterday's date, otherwise today's date.
                bool beforeNoon = Convert.ToInt32(nowDate.ToString("HH")) < 12;
                DateTime caseDate = Convert.ToDateTime(newHouse.Alsj);
                DateTime cutoff   = Convert.ToDateTime(nowDate.ToString("yyyy-MM-dd"));
                if (beforeNoon)
                {
                    cutoff = cutoff.AddDays(-1);
                }
                if (caseDate < cutoff)
                {
                    isNowPageStop = true;
                }

                // Type pages are crawled by multiple threads, so stopping is disabled for now:
                // the flag is always reset regardless of the comparison above.
                isNowPageStop = false;

                // Persist the record and trace what was saved.
                SaveNowData(newHouse);
                string savedMessage = string.Format(
                    "{0}数据保存完成url:{1}--cityname:{2}--value_title:{3}--value_lpm{4}--案例时间:{5}--爬取时间:{6}",
                    DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), url, CityName, title, lpm, newHouse.Alsj, nowDate);
                log.Debug(savedMessage);
            }
            catch (Exception ex)
            {
                string failure = string.Format("GetHouseByUrl()异常,url:{0}, cityName:{1}", url, CityName);
                log.Error(failure, ex);
            }
        }
Exemplo n.º 22
0
        /// <summary>
        /// Crawls the proxy list pages named in the "西刺ProxyIpConfig.txt" config file and
        /// imports every listed proxy through <c>ProxyIpHelp.ImportProxyIp</c>.
        /// </summary>
        /// <remarks>
        /// Each config line has the form <c>startUrl$hostPrefix$minDate</c>, for example
        /// <c>http://www.xici.net.co/nt/$http://www.xici.net.co/$2014-5-13</c>.
        /// Blank or malformed lines are logged and skipped instead of aborting the run.
        /// Paging for a start URL follows the "next page" link until there is none, a page
        /// yields no rows, or a successfully imported row is older than <c>minDate</c>.
        /// Failing to open or read the config file still throws to the caller; errors
        /// during crawling are logged and end the run.
        /// </remarks>
        public void Run()
        {
            // Row-level rules: one whole table row, and the link to the next page.
            RegexInfo regex_ipInfo   = new RegexInfo("(<tr class=\"[^\"]*\">(?:(?!</tr>).)*</tr>)", "$1");
            RegexInfo regex_nextPage = new RegexInfo("<a class=\"next_page\" rel=\"next\" href=\"([^\"]+)\">下一页[^<>]*</a>", "$1");
            // Cell-level rules applied to a single row: "ip:port", area text and date text.
            RegexInfo regex_ip       = new RegexInfo("<tr class=\"[^\"]*\"><td>(?:(?!</td>).)*</td><td>([^<>]+)</td><td>([^<>]+)</td><td><a href=\"[^\"]*\">[^<>]+</a></td>" +
                                                     "<td>[^<>]*</td><td>[^<>]*</td><td>(?:(?!</td>).)*</td><td>(?:(?!</td>).)*</td><td>[^<>]+</td>", "$1:$2");
            RegexInfo regex_area = new RegexInfo("<tr class=\"[^\"]*\"><td>(?:(?!</td>).)*</td><td>[^<>]+</td><td>[^<>]+</td><td><a href=\"[^\"]*\">([^<>]+)</a></td>" +
                                                 "<td>[^<>]*</td><td>[^<>]*</td><td>(?:(?!</td>).)*</td><td>(?:(?!</td>).)*</td><td>[^<>]+</td>", "$1");
            RegexInfo regex_date = new RegexInfo("<tr class=\"[^\"]*\"><td>(?:(?!</td>).)*</td><td>[^<>]+</td><td>[^<>]+</td><td><a href=\"[^\"]*\">[^<>]+</a></td>" +
                                                 "<td>[^<>]*</td><td>[^<>]*</td><td>(?:(?!</td>).)*</td><td>(?:(?!</td>).)*</td><td>([^<>]+)</td>", "$1");

            Dictionary<string, RegexInfo> regexDic1 = new Dictionary<string, RegexInfo>
            {
                { "regex_ipInfo", regex_ipInfo },
                { "regex_nextPage", regex_nextPage }
            };
            Dictionary<string, RegexInfo> regexDic2 = new Dictionary<string, RegexInfo>
            {
                { "regex_ip", regex_ip },
                { "regex_area", regex_area },
                { "regex_date", regex_date }
            };

            // Read the crawl configuration, one entry per line. The using block guarantees
            // the reader is released even when reading fails part-way through.
            List<string> urlList    = new List<string>();
            string       configPath = AppDomain.CurrentDomain.BaseDirectory + "IP代理/Config/西刺ProxyIpConfig.txt";
            using (StreamReader sr = new StreamReader(configPath))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    urlList.Add(line);
                }
            }

            try
            {
                foreach (string urlInfo in urlList)
                {
                    // Expect "startUrl$hostPrefix$minDate"; skip anything else so that one bad
                    // line does not abort the remaining entries.
                    string[] parts = urlInfo.Split('$');
                    DateTime maxDate;
                    if (parts.Length < 3 || !DateTime.TryParse(parts[2], out maxDate))
                    {
                        log.Debug(string.Format("配置行无效,已跳过,line:{0}", urlInfo));
                        continue;
                    }
                    string url     = parts[0];
                    string urlHost = parts[1];

                    // Follow the "next page" links for this start URL.
                    while (true)
                    {
                        Dictionary<string, List<string>> dicValueList = SpiderHelp.GetHtmlByRegexNotProxyIp(url, "utf-8", regexDic1);
                        List<string> ipInfoList = dicValueList["regex_ipInfo"];                                                      // all rows on this page
                        string       nextPage   = dicValueList["regex_nextPage"].Count < 1 ? "" : dicValueList["regex_nextPage"][0]; // link to the next page
                        if (ipInfoList == null || ipInfoList.Count < 1)
                        {
                            log.Debug(string.Format("未获取到IP列表,url:{0}", url));
                            break;
                        }

                        foreach (string ipInfo in ipInfoList)
                        {
                            Dictionary<string, List<string>> infoListDic = SpiderHelp.GetStrByRegex(ipInfo, regexDic2);
                            string ip     = infoListDic["regex_ip"].Count < 1 ? "" : infoListDic["regex_ip"][0];
                            string ipArea = infoListDic["regex_area"].Count < 1 ? "" : infoListDic["regex_area"][0];

                            // A missing or unparsable row date falls back to "now" so a single odd
                            // row cannot abort the whole crawl.
                            DateTime ipDate;
                            if (infoListDic["regex_date"].Count < 1 || !DateTime.TryParse(infoListDic["regex_date"][0], out ipDate))
                            {
                                ipDate = DateTime.Now;
                            }

                            ip = ip.TrimBlank();

                            SysData_ProxyIp existsObj;
                            string          message;
                            int result = ProxyIpHelp.ImportProxyIp(ip, ipArea, out existsObj, out message);
                            if (result != 1)
                            {
                                // NOTE(review): a failed import skips the date cut-off check below,
                                // so rows that are not imported never stop paging - confirm this is intended.
                                log.Debug(string.Format("{0},url:{1},ip:{2}", message, url, ip ?? "null"));
                                continue;
                            }
                            log.Debug(string.Format("ip插入成功,url:{0},ip:{1}", url, ip ?? "null"));

                            // Rows older than the configured date end paging for this start URL.
                            if (ipDate < maxDate)
                            {
                                nextPage = null;
                                break;
                            }
                        }

                        if (string.IsNullOrEmpty(nextPage))
                        {
                            break;
                        }
                        url = urlHost + nextPage;
                    }
                }
            }
            catch (Exception ex)
            {
                log.Error("系统异常", ex);
            }
        }
Exemplo n.º 23
0
        public void start()
        {
            RegexInfo 项目JSON信息正则 = new RegexInfo("(\\{[^\\}]*\\})", "$1");
            RegexInfo 项目ID正则     = new RegexInfo("'PARENTPROJID':'([^']*)'", "$1");
            RegexInfo 项目行政区正则    = new RegexInfo("'F_SITE':'([^']*)'", "$1");
            RegexInfo 楼栋JSON信息正则 = new RegexInfo("(\\{[^\\}]*\\})", "$1");
            RegexInfo 楼栋ID正则     = new RegexInfo("'BUILDID':'([^']*)'", "$1");
            RegexInfo 楼栋名称正则     = new RegexInfo("'F_BLOCK':'([^']*)'", "$1");
            RegexInfo 楼盘名项目名正则   = new RegexInfo("\"projectName\":\"([^\"]*)\"", "$1");
            RegexInfo 坐落地址正则     = new RegexInfo("\"location\":\"([^\"]*)\"", "$1");
            Dictionary <string, RegexInfo> 正则集合字典 = new Dictionary <string, RegexInfo>();

            正则集合字典.Add("项目JSON信息", 项目JSON信息正则);
            Dictionary <string, RegexInfo> 正则集合字典_楼栋内页 = new Dictionary <string, RegexInfo>();

            正则集合字典_楼栋内页.Add("楼盘名项目名", 楼盘名项目名正则);
            正则集合字典_楼栋内页.Add("坐落地址", 坐落地址正则);
            string NowEncoding  = "utf-8";
            string 项目列表页URL分页参数 = "http://www.cq315house.com/315web/webservice/GetMyData913.ashx?projectname=&kfs=&projectaddr=&pagesize=10&pageindex={0}&presalecert=";
            string 项目详细页URL参数   = "http://www.cq315house.com/315web/webservice/GetMyData112.ashx?projectId={0}&type=1";
            string 楼栋详细页面URL参数  = "http://www.cq315house.com/315web/HtmlPage/ShowRooms.htm?block={0}&buildingid={1}";
            string 楼栋详细页面URL参数1 = "http://www.cq315house.com/315web/webservice/GetBuildingInfo.ashx?buildingId={0}";

            string 楼盘名         = ""; //
            string 所属县区        = ""; //
            string 地址          = ""; //
            string 预售许可证号_房产证号 = "";
            string 楼栋编号        = "";
            string 单元          = ""; //unitnumber:
            string 义层          = ""; //flr:
            string 房号          = ""; //flr:+x:
            string 跃式          = ""; //rn:
            string 套内面积        = ""; //iArea:
            string 建筑面积        = ""; //bArea:
            string 使用用途        = ""; //use:
            string 户型          = ""; //rType:
            string 拟售单价_套内     = ""; //nsjg:
            string 拟售单价_建面     = ""; //nsjmjg:
            string 建筑结构        = ""; //stru
            string 签约状况        = ""; //F_ISONLINESIGN:(0:未签约)
            string 房屋状态        = ""; //F_ISOWNERSHIP:(0:可售)

            for (long i = 1; i > 0; i++)
            {
                //**********项目列表页信息(获取项目链接)************//
                Dictionary <string, List <string> > 项目列表页结果 = SpiderHelp.GetHtmlByRegex(
                    string.Format(项目列表页URL分页参数, i), "utf-8", 正则集合字典, null, 0);
                List <string> 项目JSON信息List = 项目列表页结果["项目JSON信息"];
                foreach (string 项目json in 项目JSON信息List)
                {
                    List <string> 项目IDList  = SpiderHelp.GetStrByRegexByIndex(项目json, 项目ID正则);
                    List <string> 项目行政区List = SpiderHelp.GetStrByRegexByIndex(项目json, 项目行政区正则);
                    string        项目ID      = 项目IDList.Count > 0 ? 项目IDList[0] : "";
                    所属县区 = 项目行政区List.Count > 0 ? 项目行政区List[0] : "";
                    //*************项目详细页面信息(获取楼栋链接)*************//
                    正则集合字典.Add("楼栋JSON信息", 楼栋JSON信息正则);
                    string 项目详细URL = string.Format(项目详细页URL参数, 项目ID);
                    Dictionary <string, List <string> > 楼栋列表页结果 = SpiderHelp.GetHtmlByRegex(
                        项目详细URL, "utf-8", 正则集合字典, null, 0);
                    List <string> 楼栋JSON信息List = 楼栋列表页结果["楼栋JSON信息"];
                    foreach (string 楼栋json in 楼栋JSON信息List)
                    {
                        List <重庆_楼栋>  楼栋链接信息   = new List <重庆_楼栋>();
                        List <string> 楼栋IDList = SpiderHelp.GetStrByRegexByIndex(楼栋json, 楼栋ID正则);
                        List <string> 楼栋名称List = SpiderHelp.GetStrByRegexByIndex(楼栋json, 楼栋名称正则);

                        string 楼栋ID = 楼栋IDList.Count > 0 ? 楼栋IDList[0] : "";
                        string 楼栋名称 = 楼栋名称List.Count > 0 ? 楼栋名称List[0] : "";
                        if (!string.IsNullOrEmpty(楼栋ID) && !string.IsNullOrEmpty(楼栋名称))
                        {
                            string[] 楼栋ID数组 = 楼栋ID.Split(',');
                            string[] 楼栋名称数组 = 楼栋名称.Split(',');
                            for (int j = 0; j < 楼栋ID数组.Length; j++)
                            {
                                if (j >= 楼栋名称数组.Length)
                                {
                                    break;
                                }
                                if (!string.IsNullOrEmpty(楼栋ID数组[j]))
                                {
                                    重庆_楼栋 obj = new 重庆_楼栋();
                                    obj.楼栋名称 = 楼栋名称数组[j];
                                    obj.楼栋ID = 楼栋ID数组[j];
                                    楼栋链接信息.Add(obj);
                                }
                            }
                        }
                        foreach (重庆_楼栋 楼栋obj in 楼栋链接信息)
                        {
                            //*************楼栋详细页面信息(获取房号信息)*************//
                            string 楼栋URL = string.Format(楼栋详细页面URL参数1, 楼栋obj.楼栋ID);
                            Dictionary <string, List <string> > 楼栋内页信息List = SpiderHelp.GetHtmlByRegex(楼栋URL, "utf-8", 正则集合字典_楼栋内页, null, 0);
                            楼盘名 = 楼栋内页信息List["楼盘名项目名"].Count > 0 ? 楼栋内页信息List["楼盘名项目名"][0] : "";
                            地址  = 楼栋内页信息List["坐落地址"].Count > 0 ? 楼栋内页信息List["坐落地址"][0] : "";
                        }
                    }
                }
            }
        }
Exemplo n.º 24
0
        /// <summary>
        /// Crawls the listing pages that start at <paramref name="pageListIndexUrl"/> and passes
        /// each listing page to <c>SpiderHouseByPageListUrl</c>. When the root page reports more
        /// records than maxPageCount * maxPageLength (10,000), the crawl is split across the zone
        /// links found on the root page; otherwise the root listing is paged directly.
        /// Exceptions raised inside the crawl are logged and not rethrown.
        /// </summary>
        /// <param name="hostUrl">Host prefix prepended to links that do not contain "http://".</param>
        /// <param name="pageListIndexUrl">URL of the first listing page.</param>
        /// <param name="rate">Crawl interval in milliseconds; stored in <c>Rate</c> and forwarded to the per-page crawl.</param>
        /// <param name="pageCheckRate">Page-check interval in milliseconds; when positive, the method sleeps this long before starting.</param>
        public override void SpiderHouse(string hostUrl, string pageListIndexUrl, int rate, int pageCheckRate)
        {
            // Rewrite the start URL according to the crawl start date and the current date.
            pageListIndexUrl = GetSpiderUrlByDate(pageListIndexUrl, nowDate);
            int maxPageCount  = 100;
            int maxPageLength = 100;
            int maxCount      = maxPageCount * maxPageLength;

            if (pageCheckRate > 0)
            {
                System.Threading.Thread.Sleep(pageCheckRate);
            }

            // Start the separate worker thread that runs ProcessQueue.
            // NOTE(review): IsStop is never set back to true in this method; confirm that the
            // worker is stopped elsewhere.
            Url_workload = new Queue<string>();
            IsStop       = false;
            Rate         = rate;
            ThreadStart workerStart = new ThreadStart(this.ProcessQueue);
            Thread      worker      = new Thread(workerStart);

            worker.Start();

            // Begin fetching pages.
            try
            {
                Dictionary<string, RegexInfo> rootPageRegexes = new Dictionary<string, RegexInfo>();
                rootPageRegexes.Add("*总条数", 总条数正则);
                rootPageRegexes.Add("*片区文本", 片区文本正则);
                Dictionary<string, RegexInfo> zoneLinkRegexes = new Dictionary<string, RegexInfo>();
                zoneLinkRegexes.Add("*片区链接", 片区链接正则);
                Dictionary<string, RegexInfo> zonePageRegexes = new Dictionary<string, RegexInfo>();
                zonePageRegexes.Add("*总条数", 总条数正则);

                log.Debug(string.Format("SpiderHouse()--获取根页面的总条数,hostUrl:{0}, pageListUrl:{1}, cityName{2}", hostUrl, pageListIndexUrl, CityName));
                Dictionary<string, List<string>> rootResult = SpiderHelp.GetHtmlByRegex(pageListIndexUrl, "GBK", rootPageRegexes, WebObj, CityId);
                int count = rootResult["*总条数"].Count < 1 ? 0 : Convert.ToInt32(rootResult["*总条数"][0]);
                log.Debug(string.Format("SpiderHouse()--获取根页面的总条数为{0},hostUrl:{1}, pageListUrl:{2}, cityName{3}", count.ToString(), hostUrl, pageListIndexUrl, CityName));

                if (count > maxCount)
                {
                    // Too many records for one paged listing: crawl each zone link separately.
                    string zoneHtml = rootResult["*片区文本"].Count < 1 ? "" : rootResult["*片区文本"][0];
                    Dictionary<string, List<string>> zoneLinkResult = SpiderHelp.GetStrByRegex(zoneHtml, zoneLinkRegexes);
                    List<string> zoneLinks = zoneLinkResult["*片区链接"];
                    foreach (string zoneLink in zoneLinks)
                    {
                        string zoneUrl = zoneLink;
                        if (zoneLink.IndexOf("http://", StringComparison.OrdinalIgnoreCase) < 0)
                        {
                            zoneUrl = hostUrl + zoneLink;
                        }

                        // Rewrite the zone URL according to the crawl start date and the current date.
                        zoneUrl = GetSpiderUrlByDate(zoneUrl, nowDate);
                        log.Debug(string.Format("SpiderHouse()--获取当前片区页面的总条数,当前链接{0},hostUrl:{1}, pageListUrl:{2}, cityName{3}", zoneUrl, hostUrl, pageListIndexUrl, CityName));
                        Dictionary<string, List<string>> zoneResult = SpiderHelp.GetHtmlByRegex(zoneUrl, "GBK", zonePageRegexes, WebObj, CityId);
                        int zoneCount = zoneResult["*总条数"].Count < 1 ? 0 : Convert.ToInt32(zoneResult["*总条数"][0]);
                        log.Debug(string.Format("SpiderHouse()--获取当前片区页面的总条数为{0},当前链接{1},hostUrl:{2}, pageListUrl:{3}, cityName{4}", zoneCount.ToString(), zoneUrl, hostUrl, pageListIndexUrl, CityName));

                        // Crawl the paged listing under this zone.
                        SpiderPagedList(hostUrl, pageListIndexUrl, zoneUrl, zoneCount, maxPageLength, rate, pageCheckRate,
                                        "SpiderHouse()--获取片区页面下一页链接{0},hostUrl:{1}, pageListUrl:{2}, cityName{3}");
                        log.Debug(string.Format("SpiderHouse()--获取片区页面下信息吸取完成,{0},hostUrl:{1}, pageListUrl:{2}, cityName{3}", zoneUrl, hostUrl, pageListIndexUrl, CityName));
                    }
                }
                else
                {
                    // Crawl the paged listing directly under the root page.
                    SpiderPagedList(hostUrl, pageListIndexUrl, pageListIndexUrl, count, maxPageLength, rate, pageCheckRate,
                                    "SpiderHouse()--获取当前行政区页面下一页链接{0},hostUrl:{1}, pageListUrl:{2}, cityName{3}");
                    log.Debug(string.Format("SpiderHouse()--获取行政区页面下信息吸取完成,{0},hostUrl:{1}, pageListUrl:{2}, cityName{3}", pageListIndexUrl, hostUrl, pageListIndexUrl, CityName));
                }
            }
            catch (Exception ex)
            {
                log.Error(string.Format("SpiderHouse()异常,hostUrl:{0}, pageListIndexUrl:{1}, cityName{2}", hostUrl, pageListIndexUrl, CityName), ex);
            }
            log.Debug(string.Format("SpiderHouse()--获取{0}页面下信息吸取完成,详细信息Url吸取完成,详细页面url内容正则析取中-,hostUrl:{1}, pageListUrl:{2}", CityName, hostUrl, pageListIndexUrl));
        }

        /// <summary>
        /// Walks one paged listing: crawls <paramref name="firstPageUrl"/>, then keeps following
        /// the next-page link returned by <c>SpiderHouseByPageListUrl</c> until it is empty.
        /// </summary>
        /// <param name="hostUrl">Host prefix prepended to links that do not contain "http://".</param>
        /// <param name="pageListIndexUrl">Root listing URL; used only in log output.</param>
        /// <param name="firstPageUrl">First page of the listing to walk; also the source of the paged-URL template.</param>
        /// <param name="totalCount">Total record count reported for the listing.</param>
        /// <param name="pageLength">Page size used to derive the page count from <paramref name="totalCount"/>.</param>
        /// <param name="rate">Crawl interval in milliseconds, forwarded to the per-page crawl.</param>
        /// <param name="pageCheckRate">Page-check interval in milliseconds, forwarded to the per-page crawl.</param>
        /// <param name="nextPageLogFormat">Format string for the per-page debug log; receives page URL, host URL, root listing URL and city name as {0}..{3}.</param>
        private void SpiderPagedList(string hostUrl, string pageListIndexUrl, string firstPageUrl, int totalCount,
                                     int pageLength, int rate, int pageCheckRate, string nextPageLogFormat)
        {
            // Template for building a page URL by number when no next-page link is returned.
            string pagedUrlTemplate = firstPageUrl.Replace("j3100", "i3{0}-j3100");
            string nextPageUrl      = firstPageUrl;
            int    totalPages       = (totalCount - 1) / pageLength + 1;
            int    pageNumber       = 1;

            while (!string.IsNullOrEmpty(nextPageUrl))
            {
                string pageUrl = nextPageUrl;
                if (nextPageUrl.IndexOf("http://", StringComparison.OrdinalIgnoreCase) < 0)
                {
                    pageUrl = hostUrl + nextPageUrl;
                }

                // Rewrite the page URL according to the crawl start date and the current date.
                pageUrl = GetSpiderUrlByDate(pageUrl, nowDate);
                log.Debug(string.Format(nextPageLogFormat, pageUrl, hostUrl, pageListIndexUrl, CityName));
                SpiderHouseByPageListUrl(hostUrl, pageUrl, rate, pageCheckRate, out nextPageUrl);
                pageNumber++;

                // No next-page link came back but the last page has not been reached (e.g. the
                // IP was blocked or the network failed): build the next page URL from the template.
                if (string.IsNullOrEmpty(nextPageUrl) && pageNumber <= totalPages)
                {
                    nextPageUrl = string.Format(pagedUrlTemplate, pageNumber.ToString());
                }
            }
        }
Exemplo n.º 25
0
        /// <summary>
        /// Fetches one detail page, extracts the listing fields with the configured regexes,
        /// wraps them in a <c>NewHouse</c> and saves it. Any exception is logged and not rethrown.
        /// </summary>
        /// <param name="url">Detail-page URL.</param>
        public void GetHouseByUrl(string url)
        {
            try
            {
                // Extraction rule (regex) for every field, keyed by the name used to read the result.
                Dictionary<string, RegexInfo> fieldRules = new Dictionary<string, RegexInfo>
                {
                    { "*regex_lpm", regex_lpm },
                    { "*regex_xzq", regex_xzq },
                    { "regex_pq", regex_pq },
                    { "regex_hx", regex_hx },
                    { "*regex_mj", regex_mj },
                    { "*regex_dj", regex_dj },
                    { "*regex_zj", regex_zj },
                    { "regex_jznd", regex_jznd },
                    { "regex_cx", regex_cx },
                    { "regex_szlc", regex_szlc },
                    { "regex_zlc", regex_zlc },
                    { "regex_jg", regex_jg },
                    { "regex_zx", regex_zx },
                    { "regex_title", regex_title },
                    { "regex_phone", regex_phone },
                    { "regex_address", regex_address },
                    { "regex_updatetime", regex_updatetime },
                    { "regex_comName", regex_comName },
                    { "regex_comArea", regex_comArea }
                };

                // Download the page and apply the rules.
                Dictionary<string, List<string>> matches = SpiderHelp.GetHtmlByRegex(url, NowPageEncoding, fieldRules, WebObj, CityId);

                // First match of each field, or "" when the rule matched nothing.
                string title      = FirstMatchOrEmpty(matches, "regex_title");
                string estateName = FirstMatchOrEmpty(matches, "*regex_lpm");
                string district   = FirstMatchOrEmpty(matches, "*regex_xzq");
                string zone       = FirstMatchOrEmpty(matches, "regex_pq");
                string layout     = FirstMatchOrEmpty(matches, "regex_hx");
                string area       = FirstMatchOrEmpty(matches, "*regex_mj");
                string unitPrice  = FirstMatchOrEmpty(matches, "*regex_dj");
                string totalPrice = FirstMatchOrEmpty(matches, "*regex_zj");
                string buildYear  = FirstMatchOrEmpty(matches, "regex_jznd");
                string facing     = FirstMatchOrEmpty(matches, "regex_cx");
                string floor      = FirstMatchOrEmpty(matches, "regex_szlc");
                string floorCount = FirstMatchOrEmpty(matches, "regex_zlc");
                string structure  = FirstMatchOrEmpty(matches, "regex_jg");
                string decoration = FirstMatchOrEmpty(matches, "regex_zx");
                string phone      = FirstMatchOrEmpty(matches, "regex_phone");
                string address    = FirstMatchOrEmpty(matches, "regex_address");
                string updateTime = FirstMatchOrEmpty(matches, "regex_updatetime");
                string company    = FirstMatchOrEmpty(matches, "regex_comName");
                string branch     = FirstMatchOrEmpty(matches, "regex_comArea");

                // Build the entity from the extracted values.
                NewHouse house = new NewHouse(
                    estateName, updateTime, district, zone, "", "", "", area, unitPrice,
                    "", structure, "", totalPrice, floor, floorCount, layout, facing, decoration, buildYear,
                    title, phone, url, "", 网站名称, address, "", "", "", "", "", "", company, branch);

                // Persist it.
                SaveNowData(house);
                log.Debug(string.Format("{0}-数据保存完成url:{1}--cityname:{2}--value_title:{3}--value_lpm{4}", DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), url, CityName, title, estateName));
            }
            catch (Exception ex)
            {
                log.Error(string.Format("GetHouseByUrl()异常,url:{0}, cityName:{1}", url, CityName), ex);
            }
        }

        /// <summary>
        /// Returns the first captured value stored under <paramref name="key"/>, or an empty
        /// string when that entry's list is empty.
        /// </summary>
        private static string FirstMatchOrEmpty(Dictionary<string, List<string>> matches, string key)
        {
            List<string> found = matches[key];
            return found.Count < 1 ? "" : found[0];
        }
Exemplo n.º 26
0
        /// <summary>
        /// Crawls the paged listing that starts at <paramref name="pageListIndexUrl"/>: starts a
        /// worker thread running <c>ProcessQueue</c>, reads the total record count from the first
        /// page, then passes each listing page to <c>SpiderHouseByPageListUrl</c>.
        /// <c>IsStop</c> is set to true when the crawl ends, whether it completes or throws.
        /// </summary>
        /// <param name="hostUrl">Host prefix prepended to links that do not contain "http://".</param>
        /// <param name="pageListIndexUrl">URL of the first listing page.</param>
        /// <param name="rate">Crawl interval; stored in <c>Rate</c> and forwarded to the per-page crawl.</param>
        /// <param name="pageCheckRate">When positive, the method sleeps this many milliseconds before starting; also forwarded to the per-page crawl.</param>
        public void SpiderHouse(string hostUrl, string pageListIndexUrl, int rate, int pageCheckRate)
        {
            // Page size used to derive the page count from the total record count.
            int maxPageLength = 50;

            if (pageCheckRate > 0)
            {
                System.Threading.Thread.Sleep(pageCheckRate);
            }

            // Start the separate worker thread that runs ProcessQueue.
            Url_workload = new Queue<string>();
            IsStop       = false;
            Rate         = rate;
            ThreadStart workerStart = new ThreadStart(this.ProcessQueue);
            Thread      worker      = new Thread(workerStart);
            worker.Start();

            try
            {
                // Read the total record count from the root page.
                Dictionary<string, RegexInfo> totalCountRegexes = new Dictionary<string, RegexInfo>();
                totalCountRegexes.Add("*总条数", 总条数正则);
                log.Debug(string.Format("新浪二手房SpiderHouse()--获取根页面的总条数,hostUrl:{0}, pageListUrl:{1}, cityName{2}", hostUrl, pageListIndexUrl, CityName));
                Dictionary<string, List<string>> rootResult = SpiderHelp.GetHtmlByRegex(pageListIndexUrl, NowPageEncoding, totalCountRegexes, WebObj, CityId);
                int count = rootResult["*总条数"].Count < 1 ? 0 : Convert.ToInt32(rootResult["*总条数"][0]);
                log.Debug(string.Format("新浪二手房SpiderHouse()--获取根页面的总条数为{0},hostUrl:{1}, pageListUrl:{2}, cityName{3}", count.ToString(), hostUrl, pageListIndexUrl, CityName));

                // Template for building a page URL by number when no next-page link is returned.
                string pagedUrlTemplate = pageListIndexUrl.Replace("p0-m2", "p0-m2-n{0}");
                pagedUrlTemplate = pagedUrlTemplate.Replace("p0-m3", "p0-m3-n{0}");
                string nextPageUrl = pageListIndexUrl;
                int    totalPages  = (count - 1) / maxPageLength + 1;
                int    pageNumber  = 1;

                while (!string.IsNullOrEmpty(nextPageUrl))
                {
                    string pageUrl = nextPageUrl;
                    if (nextPageUrl.IndexOf("http://", StringComparison.OrdinalIgnoreCase) < 0)
                    {
                        pageUrl = hostUrl + nextPageUrl;
                    }
                    log.Debug(string.Format("新浪二手房SpiderHouse()--获取根页面下一页链接{0},hostUrl:{1}, pageListUrl:{2}, cityName{3}", pageUrl, hostUrl, pageListIndexUrl, CityName));
                    SpiderHouseByPageListUrl(hostUrl, pageUrl, rate, pageCheckRate, out nextPageUrl);
                    pageNumber++;

                    // No next-page link came back but the last page has not been reached (e.g. the
                    // IP was blocked or the network failed): build the next page URL from the template.
                    if (string.IsNullOrEmpty(nextPageUrl) && pageNumber <= totalPages)
                    {
                        nextPageUrl = string.Format(pagedUrlTemplate, pageNumber.ToString());
                    }
                }
                log.Debug(string.Format("新浪二手房SpiderHouse()--获取{0}页面下详细信息Url吸取完成,详细页面url内容正则析取中-,hostUrl:{1}, pageListUrl:{2}", CityName, hostUrl, pageListIndexUrl));
            }
            finally
            {
                // Always raise the stop flag, even when the crawl throws, so it is not left
                // false after a failed run. Presumably ProcessQueue polls IsStop to decide when
                // to exit - verify against its implementation.
                IsStop = true;
            }
        }
Exemplo n.º 27
0
        /// <summary>
        /// Crawls the listing at <paramref name="pageListIndexUrl"/>, drilling down as far as needed
        /// to keep each paged listing within maxPageCount * maxPageLength (5,000) records:
        /// root page, then district links, then zone links. The actual paging is delegated to
        /// <c>SpiderPage_Thread</c>. Exceptions raised inside the crawl are logged and not rethrown.
        /// </summary>
        /// <param name="hostUrl">Host prefix prepended to links that do not contain "http://".</param>
        /// <param name="pageListIndexUrl">URL of the root listing page.</param>
        /// <param name="rate">When positive, the method sleeps this many milliseconds before iterating district links; also forwarded to <c>SpiderPage_Thread</c>.</param>
        /// <param name="pageCheckRate">When positive, the method sleeps this many milliseconds before starting; also forwarded to <c>SpiderPage_Thread</c>.</param>
        public void SpiderHouse(string hostUrl, string pageListIndexUrl, int rate, int pageCheckRate)
        {
            const int maxPageCount  = 500;
            const int maxPageLength = 10;
            const int maxCount      = maxPageCount * maxPageLength;

            if (pageCheckRate > 0)
            {
                System.Threading.Thread.Sleep(pageCheckRate);
            }

            try
            {
                // Regex sets for each level of the drill-down.
                Dictionary<string, RegexInfo> rootPageRules = new Dictionary<string, RegexInfo>
                {
                    { "*总条数", 总条数正则 },
                    { "*行政区文本", 行政区文本正则 }
                };
                Dictionary<string, RegexInfo> districtLinkRules = new Dictionary<string, RegexInfo>
                {
                    { "*行政区链接", 行政区链接正则 }
                };
                Dictionary<string, RegexInfo> districtPageRules = new Dictionary<string, RegexInfo>
                {
                    { "*总条数", 总条数正则 },
                    { "*片区文本", 片区文本正则 }
                };
                Dictionary<string, RegexInfo> zoneLinkRules = new Dictionary<string, RegexInfo>
                {
                    { "*片区链接", 片区链接正则 }
                };
                Dictionary<string, RegexInfo> zonePageRules = new Dictionary<string, RegexInfo>
                {
                    { "*总条数", 总条数正则 }
                };

                // Root page: total record count and district links.
                log.Debug(string.Format("常州房产网SpiderHouse()--获取根页面的总条数,hostUrl:{0}, pageListUrl:{1}, cityName{2}", hostUrl, pageListIndexUrl, CityName));
                Dictionary<string, List<string>> rootMatches = SpiderHelp.GetHtmlByRegex(pageListIndexUrl, "gb2312", rootPageRules, WebObj, CityId, timeout: 30000);
                List<string> rootTotals = rootMatches["*总条数"];
                int count = 0;
                if (rootTotals.Count >= 1)
                {
                    count = Convert.ToInt32(rootTotals[0]);
                }
                log.Debug(string.Format("常州房产网SpiderHouse()--获取根页面的总条数为{0},hostUrl:{1}, pageListUrl:{2}, cityName{3}", count.ToString(), hostUrl, pageListIndexUrl, CityName));

                List<string> districtBlocks = rootMatches["*行政区文本"];
                string districtHtml = districtBlocks.Count < 1 ? "" : districtBlocks[0];
                Dictionary<string, List<string>> districtLinkMatches = SpiderHelp.GetStrByRegex(districtHtml, districtLinkRules);
                List<string> districtLinks = districtLinkMatches["*行政区链接"];

                if (count <= maxCount || districtLinks.Count < 1)
                {
                    // Small enough (or no district links): page through the root listing itself.
                    SpiderPage_Thread(maxPageCount, maxPageLength, hostUrl, pageListIndexUrl, pageListIndexUrl, rate, pageCheckRate, count, "根");
                }
                else
                {
                    if (rate > 0)
                    {
                        System.Threading.Thread.Sleep(rate);
                    }

                    foreach (string districtLink in districtLinks)
                    {
                        // Links containing "/sale_0_" are skipped.
                        if (districtLink.Contains("/sale_0_"))
                        {
                            continue;
                        }

                        isNowPageStop = false;
                        string districtUrl = districtLink.ToLower().Contains("http://") ? districtLink : hostUrl + districtLink;

                        // District page: total record count and zone links.
                        log.Debug(string.Format("常州房产网SpiderHouse()--获取当前行政区页面的总条数,当前链接{0},hostUrl:{1}, pageListUrl:{2}, cityName{3}", districtUrl, hostUrl, pageListIndexUrl, CityName));
                        Dictionary<string, List<string>> districtMatches = SpiderHelp.GetHtmlByRegex(districtUrl, "gb2312", districtPageRules, WebObj, CityId, timeout: 30000);
                        List<string> districtTotals = districtMatches["*总条数"];
                        int districtCount = 0;
                        if (districtTotals.Count >= 1)
                        {
                            districtCount = Convert.ToInt32(districtTotals[0]);
                        }
                        log.Debug(string.Format("常州房产网SpiderHouse()--获取当前行政区页面的总条数为{0},当前链接{1},hostUrl:{2}, pageListUrl:{3}, cityName{4}", districtCount.ToString(), districtUrl, hostUrl, pageListIndexUrl, CityName));

                        List<string> zoneBlocks = districtMatches["*片区文本"];
                        string zoneHtml = zoneBlocks.Count < 1 ? "" : zoneBlocks[0];
                        Dictionary<string, List<string>> zoneLinkMatches = SpiderHelp.GetStrByRegex(zoneHtml, zoneLinkRules);
                        List<string> zoneLinks = zoneLinkMatches["*片区链接"];

                        if (districtCount <= maxCount || zoneLinks.Count < 1)
                        {
                            // Small enough (or no zone links): page through the district listing.
                            SpiderPage_Thread(maxPageCount, maxPageLength, hostUrl, districtUrl, pageListIndexUrl, rate, pageCheckRate, districtCount, "行政区");
                            continue;
                        }

                        // District is too large: page through each of its zones instead.
                        foreach (string zoneLink in zoneLinks)
                        {
                            isNowPageStop = false;
                            string zoneUrl = zoneLink.ToLower().Contains("http://") ? zoneLink : hostUrl + zoneLink;

                            log.Debug(string.Format("常州房产网SpiderHouse()--获取当前片区页面的总条数,当前链接{0},hostUrl:{1}, pageListUrl:{2}, cityName{3}", zoneUrl, hostUrl, pageListIndexUrl, CityName));
                            Dictionary<string, List<string>> zoneMatches = SpiderHelp.GetHtmlByRegex(zoneUrl, "gb2312", zonePageRules, WebObj, CityId, timeout: 30000);
                            List<string> zoneTotals = zoneMatches["*总条数"];
                            int zoneCount = 0;
                            if (zoneTotals.Count >= 1)
                            {
                                zoneCount = Convert.ToInt32(zoneTotals[0]);
                            }
                            log.Debug(string.Format("常州房产网SpiderHouse()--获取当前片区页面的总条数为{0},当前链接{1},hostUrl:{2}, pageListUrl:{3}, cityName{4}", zoneCount.ToString(), zoneUrl, hostUrl, pageListIndexUrl, CityName));

                            SpiderPage_Thread(maxPageCount, maxPageLength, hostUrl, zoneUrl, pageListIndexUrl, rate, pageCheckRate, zoneCount, "片区");
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                log.Error(string.Format("常州房产网SpiderHouse()异常,hostUrl:{0}, pageListIndexUrl:{1}, cityName{2}", hostUrl, pageListIndexUrl, CityName), ex);
            }
            log.Debug(string.Format("常州房产网SpiderHouse()--获取{0}页面下详细信息Url吸取完成,详细页面url内容正则析取中-,hostUrl:{1}, pageListUrl:{2}", CityName, hostUrl, pageListIndexUrl));
        }
Exemplo n.º 28
0
        /// <summary>
        /// Crawls the house listing pages of the site, starting from <paramref name="pageListIndexUrl"/>.
        /// </summary>
        /// <remarks>
        /// The root page is fetched first to read its total record count and its district links.
        /// When that count exceeds <c>maxPageCount * maxPageLength</c> and district links were found,
        /// every district page is inspected the same way; a district that is still over the limit
        /// and exposes area links is crawled area by area, otherwise the district itself is crawled.
        /// When the root is within the limit (or has no district links) the root listing is crawled
        /// directly. Any exception is logged and swallowed, and <c>IsStop</c> is set to <c>true</c>
        /// before the method returns.
        /// </remarks>
        /// <param name="hostUrl">Prefix prepended to every link that does not contain "http://".</param>
        /// <param name="pageListIndexUrl">URL of the root listing page.</param>
        /// <param name="rate">When positive, passed to <c>Thread.Sleep</c> once before the district loop; always forwarded to <c>SpiderHouseByPageListUrl</c>.</param>
        /// <param name="pageCheckRate">When positive, passed to <c>Thread.Sleep</c> once on entry; always forwarded to <c>SpiderHouseByPageListUrl</c>.</param>
        public virtual void SpiderHouse(string hostUrl, string pageListIndexUrl, int rate, int pageCheckRate)
        {
            // NOTE(review): presumably the site serves at most maxPageCount pages of
            // maxPageLength records per listing - confirm against the target site.
            int maxPageCount  = 313;
            int maxPageLength = 41;
            int maxCount      = maxPageCount * maxPageLength;
            if (pageCheckRate > 0)
            {
                System.Threading.Thread.Sleep(pageCheckRate);
            }
            // Start fetching pages.
            try
            {
                Dictionary <string, RegexInfo> rootRegexes = new Dictionary <string, RegexInfo>();
                rootRegexes.Add("*总条数", 总条数正则);
                rootRegexes.Add("*行政区链接", 行政区链接正则);
                Dictionary <string, RegexInfo> districtRegexes = new Dictionary <string, RegexInfo>();
                districtRegexes.Add("*片区链接", 片区链接正则);
                districtRegexes.Add("*总条数", 总条数正则);
                Dictionary <string, RegexInfo> areaRegexes = new Dictionary <string, RegexInfo>();
                areaRegexes.Add("*总条数", 总条数正则);

                log.Debug(string.Format(网站名称 + "SpiderHouse()--获取根页面的总条数,hostUrl:{0}, pageListUrl:{1}, cityName{2}", hostUrl, pageListIndexUrl, CityName));
                Dictionary <string, List <string> > rootResult = SpiderHelp.GetHtmlByRegex(pageListIndexUrl, "utf-8", rootRegexes, WebObj, CityId);
                int count = ReadTotalCount(rootResult);
                log.Debug(string.Format(网站名称 + "SpiderHouse()--获取根页面的总条数为{0},hostUrl:{1}, pageListUrl:{2}, cityName{3}", count.ToString(), hostUrl, pageListIndexUrl, CityName));

                List <string> districtLinks = rootResult["*行政区链接"];
                if (count > maxCount && districtLinks.Count > 0)
                {
                    // Root listing is over the limit: split the crawl by district.
                    if (rate > 0)
                    {
                        System.Threading.Thread.Sleep(rate);
                    }
                    foreach (string districtLink in districtLinks)
                    {
                        isNowPageStop = false;
                        string nowUrl = ResolveAgainstHost(hostUrl, districtLink);

                        // Read the district's record count and its area links.
                        log.Debug(string.Format(网站名称 + "SpiderHouse()--获取当前行政区页面的总条数,当前链接{0},hostUrl:{1}, pageListUrl:{2}, cityName{3}", nowUrl, hostUrl, pageListIndexUrl, CityName));
                        Dictionary <string, List <string> > districtResult = SpiderHelp.GetHtmlByRegex(nowUrl, "utf-8", districtRegexes, WebObj, CityId);
                        int _count = ReadTotalCount(districtResult);
                        log.Debug(string.Format(网站名称 + "SpiderHouse()--获取当前行政区页面的总条数为{0},当前链接{1},hostUrl:{2}, pageListUrl:{3}, cityName{4}", _count.ToString(), nowUrl, hostUrl, pageListIndexUrl, CityName));

                        List <string> areaLinks = districtResult["*片区链接"];
                        if (_count > maxCount && areaLinks.Count > 0)
                        {
                            // District is still over the limit: split the crawl by area.
                            foreach (string areaLink in areaLinks)
                            {
                                isNowPageStop = false;
                                string nowUrl2 = ResolveAgainstHost(hostUrl, areaLink);

                                log.Debug(string.Format(网站名称 + "SpiderHouse()--获取当前片区页面的总条数,当前链接{0},hostUrl:{1}, pageListUrl:{2}, cityName{3}", nowUrl2, hostUrl, pageListIndexUrl, CityName));
                                Dictionary <string, List <string> > areaResult = SpiderHelp.GetHtmlByRegex(nowUrl2, "utf-8", areaRegexes, WebObj, CityId);
                                int _count2 = ReadTotalCount(areaResult);
                                log.Debug(string.Format(网站名称 + "SpiderHouse()--获取当前片区页面的总条数为{0},当前链接{1},hostUrl:{2}, pageListUrl:{3}, cityName{4}", _count2.ToString(), nowUrl2, hostUrl, pageListIndexUrl, CityName));

                                CrawlPagedListing("片区", nowUrl2, _count2, maxPageLength, hostUrl, pageListIndexUrl, rate, pageCheckRate);
                            }
                        }
                        else
                        {
                            CrawlPagedListing("行政区", nowUrl, _count, maxPageLength, hostUrl, pageListIndexUrl, rate, pageCheckRate);
                        }
                    }
                }
                else
                {
                    CrawlPagedListing("根", pageListIndexUrl, count, maxPageLength, hostUrl, pageListIndexUrl, rate, pageCheckRate);
                }
            }
            catch (Exception ex)
            {
                log.Error(string.Format(网站名称 + "SpiderHouse()异常,hostUrl:{0}, pageListIndexUrl:{1}, cityName{2}", hostUrl, pageListIndexUrl, CityName), ex);
            }
            log.Debug(string.Format(网站名称 + "SpiderHouse()--获取{0}页面下详细信息Url吸取完成,详细页面url内容正则析取中-,hostUrl:{1}, pageListUrl:{2}", CityName, hostUrl, pageListIndexUrl));
            IsStop = true;
        }

        /// <summary>
        /// Walks one paginated listing page by page via <c>SpiderHouseByPageListUrl</c>, starting at
        /// <paramref name="startUrl"/>, until no next link remains or <c>isNowPageStop</c> is set.
        /// </summary>
        /// <param name="levelName">Level label inserted into the log messages ("根", "行政区" or "片区").</param>
        /// <param name="startUrl">First page of the listing; also the base of the synthesized page links.</param>
        /// <param name="totalCount">Record count reported for the listing, used to derive the page count.</param>
        /// <param name="maxPageLength">Number of records assumed per page.</param>
        /// <param name="hostUrl">Prefix prepended to links that do not contain "http://".</param>
        /// <param name="pageListIndexUrl">Root listing URL, only used in log messages.</param>
        /// <param name="rate">Forwarded to <c>SpiderHouseByPageListUrl</c>.</param>
        /// <param name="pageCheckRate">Forwarded to <c>SpiderHouseByPageListUrl</c>.</param>
        private void CrawlPagedListing(string levelName, string startUrl, int totalCount, int maxPageLength, string hostUrl, string pageListIndexUrl, int rate, int pageCheckRate)
        {
            string pagingTemplate = startUrl + "o{0}/";
            string nextPageLink   = startUrl;
            int    totalPages     = (totalCount - 1) / maxPageLength + 1;
            int    pageNumber     = 1;

            while (!string.IsNullOrEmpty(nextPageLink))
            {
                string nowPageList = ResolveAgainstHost(hostUrl, nextPageLink);
                log.Debug(string.Format(网站名称 + "SpiderHouse()--获取" + levelName + "页面下一页链接{0},hostUrl:{1}, pageListUrl:{2}, cityName{3}", nowPageList, hostUrl, pageListIndexUrl, CityName));
                SpiderHouseByPageListUrl(hostUrl, nowPageList, rate, pageCheckRate, out nextPageLink);
                pageNumber++;
                // No next link although pages should remain (per the original author: IP ban or
                // network failure) - synthesize the link for the next page number and carry on.
                if (string.IsNullOrEmpty(nextPageLink) && pageNumber <= totalPages)
                {
                    nextPageLink = string.Format(pagingTemplate, pageNumber.ToString());
                }
                if (isNowPageStop)
                {
                    break;
                }
            }
            // The start URL is logged for every level so the entry identifies the listing that finished.
            log.Debug(string.Format(网站名称 + "SpiderHouse()--获取" + levelName + "页面下信息吸取完成,{0},hostUrl:{1}, pageListUrl:{2}, cityName{3}", startUrl, hostUrl, pageListIndexUrl, CityName));
        }

        /// <summary>
        /// Returns the first "*总条数" match converted to an integer, or 0 when there is no match.
        /// </summary>
        private static int ReadTotalCount(Dictionary <string, List <string> > regexResult)
        {
            List <string> matches = regexResult["*总条数"];
            return matches.Count < 1 ? 0 : Convert.ToInt32(matches[0]);
        }

        /// <summary>
        /// Returns <paramref name="link"/> unchanged when it contains "http://" (case-insensitive),
        /// otherwise <paramref name="hostUrl"/> followed by <paramref name="link"/>.
        /// </summary>
        private static string ResolveAgainstHost(string hostUrl, string link)
        {
            // NOTE(review): "https://" links are treated as relative here, as before - confirm whether that is intended.
            return link.IndexOf("http://", StringComparison.OrdinalIgnoreCase) >= 0 ? link : hostUrl + link;
        }
Exemplo n.º 29
0
        /// <summary>
        /// Harvests proxy IPs from the itmop.com proxy catalog and from the URLs listed in the
        /// local config file, importing each one through <c>ProxyIpHelp.ImportProxyIp</c>.
        /// </summary>
        /// <remarks>
        /// The config file is read first; an exception raised while reading it propagates to the
        /// caller. The catalog is then walked page by page: the IP list links found on each catalog
        /// page are fetched and every IP entry on them is imported. The config-file URLs are
        /// processed together with the first catalog page. Exceptions raised during crawling are
        /// logged and end the run.
        /// </remarks>
        public void Run()
        {
            // One whole IP entry on an IP list page.
            RegexInfo regex_ipInfo = new RegexInfo("(\\&nbsp\\;[^<>]+<br />)", "$1");
            // First two whitespace-separated tokens of an entry, joined as "$1:$2" (presumably ip:port).
            RegexInfo regex_ip = new RegexInfo("\\&nbsp\\;([^<>\\s]+) ([^<>\\s]+) [^<>]+<br />", "$1:$2");
            // Third token of an entry, used as the IP's area.
            RegexInfo regex_area = new RegexInfo("\\&nbsp\\;[^<>\\s]+ [^<>\\s]+ ([^<>\\s]+) [^<>]+<br />", "$1");

            Dictionary <string, RegexInfo> regexDic1 = new Dictionary <string, RegexInfo>();
            regexDic1.Add("regex_ipInfo", regex_ipInfo);
            Dictionary <string, RegexInfo> regexDic2 = new Dictionary <string, RegexInfo>();
            regexDic2.Add("regex_ip", regex_ip);
            regexDic2.Add("regex_area", regex_area);

            // Seed the work list with the IP list page URLs from the text config file.
            List <string> urlList    = new List <string>();
            string        configPath = AppDomain.CurrentDomain.BaseDirectory + "IP代理/Config/猫扑网IP代理ProxyIpConfig.txt";
            // 'using' guarantees the reader is released even when ReadLine throws.
            using (StreamReader sr = new StreamReader(configPath))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    urlList.Add(line);
                }
            }

            // Regexes for the catalog page: links to IP list pages and the catalog's last page number.
            RegexInfo regex_urllist           = new RegexInfo("<DT><a href=\"([^\"]+)\" target=\"_blank\">[^<>]+</a></DT>", "$1");
            RegexInfo regex_urllist_pagecount = new RegexInfo("<a href=\"http://www.itmop.com/proxy/catalog.asp\\?page=(\\d+)\">\\&raquo\\;</a></DIV>", "$1");
            Dictionary <string, RegexInfo> regexDic3 = new Dictionary <string, RegexInfo>();
            regexDic3.Add("regex_urllist", regex_urllist);
            regexDic3.Add("regex_urllist_pagecount", regex_urllist_pagecount);

            int    urllist_index         = 1;
            string urllist_nextPage_para = "http://www.itmop.com/proxy/catalog.asp?page={0}";
            string urllist_nextPage      = string.Format(urllist_nextPage_para, urllist_index);

            try
            {
                do
                {
                    // Collect the IP list page links of the current catalog page.
                    Dictionary <string, List <string> > catalogResult = SpiderHelp.GetHtmlByRegexNotProxyIp(urllist_nextPage, "utf-8", regexDic3);
                    urlList.AddRange(catalogResult["regex_urllist"]);

                    // Decide whether another catalog page follows this one.
                    List <string> pageCountMatches  = catalogResult["regex_urllist_pagecount"];
                    int           urllist_max_index = pageCountMatches.Count < 1 ? 0 : Convert.ToInt32(pageCountMatches[0].TrimBlank());
                    if (urllist_index < urllist_max_index)
                    {
                        urllist_index    = urllist_index + 1;
                        urllist_nextPage = string.Format(urllist_nextPage_para, urllist_index);
                    }
                    else
                    {
                        urllist_nextPage = "";
                    }

                    // Crawl every IP list page gathered for this round.
                    SysData_ProxyIp existsObj = new SysData_ProxyIp();
                    string          message   = "";
                    foreach (string entry in urlList)
                    {
                        // An entry is "url" or "url$host"; the host defaults to the itmop.com site.
                        string[] parts   = entry.Replace("&amp;", "&").Split('$');
                        string   url     = parts[0];
                        string   urlHost = parts.Length < 2 ? "http://www.itmop.com" : parts[1];
                        if (!url.ToLower().Contains("http://"))
                        {
                            url = urlHost + url;
                        }

                        Dictionary <string, List <string> > dicValueList = SpiderHelp.GetHtmlByRegexNotProxyIp(url, "utf-8", regexDic1);
                        List <string> ipInfoList = dicValueList["regex_ipInfo"];
                        if (ipInfoList == null || ipInfoList.Count < 1)
                        {
                            log.Debug(string.Format("未获取到IP列表,url:{0}", url));
                            continue;
                        }

                        foreach (string ipInfo in ipInfoList)
                        {
                            Dictionary <string, List <string> > infoListDic = SpiderHelp.GetStrByRegex(ipInfo, regexDic2);
                            string ip     = infoListDic["regex_ip"].Count < 1 ? "" : infoListDic["regex_ip"][0];
                            string ipArea = infoListDic["regex_area"].Count < 1 ? "" : infoListDic["regex_area"][0];
                            ipArea = ipArea.RemoveHeml();
                            ip     = ip.TrimBlank();
                            int result = ProxyIpHelp.ImportProxyIp(ip, ipArea, out existsObj, out message);
                            if (result != 1)
                            {
                                // Any result other than 1 is treated as "not inserted"; message carries the reason.
                                log.Debug(string.Format("{0},url:{1},ip:{2}", message, url, ip == null ? "null" : ip));
                            }
                            else
                            {
                                log.Debug(string.Format("ip插入成功,url:{0},ip:{1}", url, ip == null ? "null" : ip));
                            }
                        }
                        // IP list pages are not paginated here: the earlier next-page variable was
                        // never assigned, so its follow-up request could never run and was dropped.
                    }

                    // The next catalog page starts with an empty work list.
                    urlList.Clear();
                }
                while (!string.IsNullOrEmpty(urllist_nextPage));
            }
            catch (Exception ex)
            {
                log.Error("系统异常", ex);
            }
        }
Exemplo n.º 30
0
        /// <summary>
        /// Crawls the site's house listings starting from <paramref name="pageListIndexUrl"/>,
        /// descending root → district → area wherever links for the next level are found.
        /// </summary>
        /// <remarks>
        /// The root page yields a district text block from which district links are extracted.
        /// Without district links the root listing is crawled directly. Otherwise each district
        /// page yields an area text block; a district with area links is crawled area by area,
        /// a district without them is crawled as a whole. The crawling itself is delegated to
        /// <c>SpiderPage_Thread</c>. Exceptions are logged and swallowed, and <c>IsStop</c> is set
        /// to <c>true</c> before returning.
        /// </remarks>
        /// <param name="hostUrl">Prefix prepended to links that do not contain "http://".</param>
        /// <param name="pageListIndexUrl">URL of the root listing page.</param>
        /// <param name="rate">When positive, passed to <c>Thread.Sleep</c> once before the district loop; also forwarded to <c>SpiderPage_Thread</c>.</param>
        /// <param name="pageCheckRate">When positive, passed to <c>Thread.Sleep</c> once on entry; also forwarded to <c>SpiderPage_Thread</c>.</param>
        public void SpiderHouse(string hostUrl, string pageListIndexUrl, int rate, int pageCheckRate)
        {
            int maxPageCount  = 70;
            int maxPageLength = 92;

            if (pageCheckRate > 0)
            {
                System.Threading.Thread.Sleep(pageCheckRate);
            }

            // Start fetching pages.
            try
            {
                var rootRegexes         = new Dictionary <string, RegexInfo> { { "*行政区文本", 行政区文本正则 } };
                var districtLinkRegexes = new Dictionary <string, RegexInfo> { { "*行政区链接", 行政区链接正则 } };
                var districtRegexes     = new Dictionary <string, RegexInfo> { { "*片区文本", 片区文本正则 } };
                var areaLinkRegexes     = new Dictionary <string, RegexInfo> { { "*片区链接", 片区链接正则 } };

                log.Debug(string.Format(网站名称 + "SpiderHouse()--获取根页面的总条数,hostUrl:{0}, pageListUrl:{1}, cityName{2}", hostUrl, pageListIndexUrl, CityName));
                Dictionary <string, List <string> > rootResult = SpiderHelp.GetHtmlByRegex(pageListIndexUrl, "utf-8", rootRegexes, WebObj, CityId);

                // First district text block of the root page, or empty when none matched.
                List <string> districtTextMatches = rootResult["*行政区文本"];
                string        districtText        = "";
                if (districtTextMatches.Count >= 1)
                {
                    districtText = districtTextMatches[0];
                }
                List <string> districtLinks = SpiderHelp.GetStrByRegex(districtText, districtLinkRegexes)["*行政区链接"];
                log.Debug(string.Format(网站名称 + "SpiderHouse()--获取根页面的行政区数为{0},hostUrl:{1}, pageListUrl:{2}, cityName{3}", districtLinks.Count.ToString(), hostUrl, pageListIndexUrl, CityName));

                if (districtLinks.Count <= 0)
                {
                    // No districts: crawl the listing under the root page.
                    SpiderPage_Thread(maxPageCount, maxPageLength, hostUrl, pageListIndexUrl, pageListIndexUrl, rate, pageCheckRate, "根");
                }
                else
                {
                    if (rate > 0)
                    {
                        System.Threading.Thread.Sleep(rate);
                    }

                    foreach (string districtLink in districtLinks)
                    {
                        isNowPageStop = false;
                        string districtUrl = districtLink.ToLower().Contains("http://") ? districtLink : hostUrl + districtLink;

                        log.Debug(string.Format(网站名称 + "SpiderHouse()--获取当前行政区页面的总条数,当前链接{0},hostUrl:{1}, pageListUrl:{2}, cityName{3}", districtUrl, hostUrl, pageListIndexUrl, CityName));
                        Dictionary <string, List <string> > districtResult = SpiderHelp.GetHtmlByRegex(districtUrl, "utf-8", districtRegexes, WebObj, CityId);

                        // First area text block of the district page, or empty when none matched.
                        List <string> areaTextMatches = districtResult["*片区文本"];
                        string        areaText        = "";
                        if (areaTextMatches.Count >= 1)
                        {
                            areaText = areaTextMatches[0];
                        }
                        List <string> areaLinks = SpiderHelp.GetStrByRegex(areaText, areaLinkRegexes)["*片区链接"];
                        log.Debug(string.Format(网站名称 + "SpiderHouse()--获取当前行政区页面的片区个数为{0},当前链接{1},hostUrl:{2}, pageListUrl:{3}, cityName{4}", areaLinks.Count.ToString(), districtUrl, hostUrl, pageListIndexUrl, CityName));

                        if (areaLinks.Count <= 0)
                        {
                            // No areas: crawl the listing under the district page.
                            SpiderPage_Thread(maxPageCount, maxPageLength, hostUrl, districtUrl, pageListIndexUrl, rate, pageCheckRate, "行政区");
                            continue;
                        }

                        // Crawl the listing under each area page.
                        foreach (string areaLink in areaLinks)
                        {
                            isNowPageStop = false;
                            string areaUrl = areaLink.ToLower().Contains("http://") ? areaLink : hostUrl + areaLink;
                            SpiderPage_Thread(maxPageCount, maxPageLength, hostUrl, areaUrl, pageListIndexUrl, rate, pageCheckRate, "片区");
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                log.Error(string.Format("" + 网站名称 + "SpiderHouse()异常,hostUrl:{0}, pageListIndexUrl:{1}, cityName{2}", hostUrl, pageListIndexUrl, CityName), ex);
            }

            log.Debug(string.Format("" + 网站名称 + "SpiderHouse()--获取{0}页面下详细信息Url吸取完成,详细页面url内容正则析取中-,hostUrl:{1}, pageListUrl:{2}", CityName, hostUrl, pageListIndexUrl));
            IsStop = true;
        }