コード例 #1
0
        /// <summary>
        /// Prepares this spider for Shanghai and launches its crawl run.
        /// </summary>
        /// <remarks>
        /// Sets the page encoding to GBK, appends one XML-defined rule to each field
        /// extractor, then looks up the Sina second-hand-housing crawl configuration for
        /// Shanghai. Nothing is started when no configuration is returned.
        /// </remarks>
        public void start()
        {
            // Third argument handed to every GetRegexInfoByXmlName lookup below.
            const string lookupKey = "NewDataSpider2";
            const string city = "上海";

            NowPageEncoding = "gbk";

            regex_lpm.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_lpm", 网站名称, lookupKey));
            regex_xzq.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_xzq", 网站名称, lookupKey));
            regex_pq.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_pq", 网站名称, lookupKey));
            regex_mj.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_mj", 网站名称, lookupKey));
            regex_dj.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_dj", 网站名称, lookupKey));
            regex_zj.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_zj", 网站名称, lookupKey));
            regex_szlc.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_szlc", 网站名称, lookupKey));
            regex_zlc.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_zlc", 网站名称, lookupKey));
            regex_hx.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_hx", 网站名称, lookupKey));
            regex_cx.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_cx", 网站名称, lookupKey));
            regex_zx.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_zx", 网站名称, lookupKey));
            regex_jznd.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_jznd", 网站名称, lookupKey));
            regex_title.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_title", 网站名称, lookupKey));
            regex_phone.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_phone", 网站名称, lookupKey));
            regex_infUrl.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_infUrl", 网站名称, lookupKey));
            regex_address.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_address", 网站名称, lookupKey));
            regex_nextPage.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_nextPage", 网站名称, lookupKey));

            网站爬取配置 config = SpiderWebConfigManager.根据城市获取新浪二手房爬取配置(city);
            if (config == null)
            {
                // No configuration for this city: there is nothing to crawl.
                return;
            }

            NewDataRum run = new NewDataRum(city, config.域名, config.列表页链接, config.详细页面爬取频率, config.列表页面爬取频率);
            run.start(this);
        }
コード例 #2
0
        /// <summary>
        /// Starts the city-specific spiders that are currently enabled, then starts a
        /// generic <c>其他城市</c> spider for every other configured city.
        /// </summary>
        /// <remarks>
        /// After the dedicated spiders are started, all Sina second-hand-housing crawl
        /// configurations are loaded. A generic spider is started for each city that has
        /// no entry with the same <c>CityName</c> in <c>NewDataRumList</c>.
        /// NOTE(review): presumably the dedicated spiders register their runs in
        /// <c>NewDataRumList</c>, which is what prevents a second run here - confirm.
        /// </remarks>
        public void start()
        {
            // Dedicated city spiders. Commented-out entries are switched off on purpose.
            //new 深圳().start();
            //new 广州().start();
            // Restored from a garbled "new  海()": the class name had lost its first character.
            new 上海().start();
            new 北京().start();
            //new 贵阳().start();
            //new 哈尔滨().start();
            //new 海口().start();
            //new 合肥().start();
            //new 呼和浩特().start();
            //new 兰州().start();
            //new 南宁().start();
            //new 石家庄().start();
            //new 太原().start();
            //new 西宁().start();
            //new 银川().start();
            //new 长春().start();
            //new 郑州().start();
            //new 重庆().start();
            //new 昆明().start();
            new 天津().start();

            List<VIEW_网站爬取配置_城市表_网站表> list = SpiderWebConfigManager.获取新浪二手房下所有城市爬取配置();

            foreach (VIEW_网站爬取配置_城市表_网站表 _view in list)
            {
                NewDataRum exists = NewDataRumList.Find(run => run.CityName.Equals(_view.城市名称));
                if (exists != null)
                {
                    // This city already has an entry in NewDataRumList; do not start it again.
                    continue;
                }

                new 其他城市(_view.城市名称).start(_view.域名, _view.列表页链接, _view.详细页面爬取频率, _view.列表页面爬取频率);
            }
        }
コード例 #3
0
ファイル: 其他城市.cs プロジェクト: kingshhh/fxtcode
        /// <summary>
        /// Looks up the Ganji crawl configuration for <c>CityName</c> and, when one is
        /// returned, forwards its settings to the parameterised <c>start</c> overload.
        /// </summary>
        public void start()
        {
            网站爬取配置 config = SpiderWebConfigManager.根据城市获取赶集网爬取配置(CityName);
            if (config == null)
            {
                // No configuration for this city: nothing to start.
                return;
            }

            start(config.域名, config.列表页链接, config.详细页面爬取频率, config.列表页面爬取频率);
        }
コード例 #4
0
ファイル: 银川.cs プロジェクト: kingshhh/fxtcode
        /// <summary>
        /// Launches the crawl run for Yinchuan when a Sina second-hand-housing
        /// configuration is returned for that city; otherwise does nothing.
        /// </summary>
        public void start()
        {
            const string city = "银川";

            网站爬取配置 config = SpiderWebConfigManager.根据城市获取新浪二手房爬取配置(city);
            if (config == null)
            {
                return;
            }

            NewDataRum run = new NewDataRum(city, config.域名, config.列表页链接, config.详细页面爬取频率, config.列表页面爬取频率);
            run.start(this);
        }
コード例 #5
0
ファイル: 海口.cs プロジェクト: kingshhh/fxtcode
        /// <summary>
        /// Starts one crawl run for each SouFun configuration entry returned for Haikou.
        /// </summary>
        public void start()
        {
            const string city = "海口";

            List<网站爬取配置> configs = SpiderWebConfigManager.根据城市获取搜房网爬取配置(city);
            foreach (网站爬取配置 config in configs)
            {
                // Each entry supplies its own rule number, main usage and main case type.
                NewDataRum run = new NewDataRum(
                    city,
                    config.域名,
                    config.列表页链接,
                    config.详细页面爬取频率,
                    config.列表页面爬取频率,
                    config.规则编号,
                    config.主要用途,
                    config.主要案例类型);
                run.start(this);
            }
        }
コード例 #6
0
        /// <summary>
        /// Starts a crawl for every SouFun configuration entry returned for
        /// <c>CityName</c>, using a fresh <c>其他城市</c> instance per entry.
        /// </summary>
        public void start()
        {
            List<网站爬取配置> configs = SpiderWebConfigManager.根据城市获取搜房网爬取配置(CityName);
            foreach (网站爬取配置 config in configs)
            {
                // A new spider object per entry, carrying the same city name as this one.
                其他城市 worker = new 其他城市 { CityName = CityName };

                start(
                    worker,
                    config.域名,
                    config.列表页链接,
                    config.详细页面爬取频率,
                    config.列表页面爬取频率,
                    config.规则编号,
                    config.主要用途,
                    config.主要案例类型);
            }
        }
コード例 #7
0
        /// <summary>
        /// Walks the SouFun city directory and builds an
        /// <c>exec 往网站爬取配置表添加配置信息 ...</c> statement for each listed city that is
        /// known locally, has no SouFun crawl configuration yet, and whose listing page
        /// reports a total count.
        /// </summary>
        /// <remarks>
        /// The statements are only collected into local variables; this method neither
        /// executes nor returns them.
        /// </remarks>
        public void start()
        {
            RegexInfo totalCountPattern = new RegexInfo("共找到<strong class=\"number orange\">([\\d]*)</strong>条", "$1");
            RegexInfo directoryPattern  = new RegexInfo("<div class=\"onCont\" id=\"c01\"[^<>]*>((?:(?!</div>).)*)</div>", "$1");
            RegexInfo anchorPattern     = new RegexInfo("(<a href=\"[^\"]+\"[^<>]*>[^<>]+</a>)", "$1");

            // One dictionary is shared by all three extraction calls and grows between them.
            Dictionary<string, RegexInfo> patterns = new Dictionary<string, RegexInfo>
            {
                { "城市列表Text", directoryPattern }
            };

            Dictionary<string, List<string>> directoryPage = SpiderHelp.GetHtmlByRegex("http://soufun.com/SoufunFamily.htm", "utf-8", patterns, WebsiteManager.GetWebById(WebsiteManager.搜房网_ID), CityId);
            List<string> directoryHits = directoryPage["城市列表Text"];
            string directoryHtml = "";
            if (directoryHits.Count > 0)
            {
                directoryHtml = directoryHits[0];
            }

            patterns.Add("城市列表", anchorPattern);
            List<string> cityAnchors = SpiderHelp.GetStrByRegex(directoryHtml, patterns)["城市列表"];

            StringBuilder statements = new StringBuilder();

            patterns.Add("总条数", totalCountPattern);
            List<string> statementList = new List<string>();

            foreach (string anchorHtml in cityAnchors)
            {
                // Split one <a> element into its link text (city name) and its href (city URL).
                RegexInfo namePattern = new RegexInfo("<a href=\"[^\"]+\"[^<>]*>([^<>]+)</a>", "$1");
                RegexInfo urlPattern  = new RegexInfo("<a href=\"([^\"]+)\"[^<>]*>[^<>]+</a>", "$1");
                Dictionary<string, RegexInfo> anchorPatterns = new Dictionary<string, RegexInfo>();
                anchorPatterns.Add("regexCityName", namePattern);
                anchorPatterns.Add("regexCityUrl", urlPattern);
                Dictionary<string, List<string>> anchorParts = SpiderHelp.GetStrByRegex(anchorHtml, anchorPatterns);

                List<string> nameHits = anchorParts["regexCityName"];
                string cityName = StringHelp.TrimBlank(nameHits.Count > 0 ? nameHits[0] : "");
                List<string> urlHits = anchorParts["regexCityUrl"];
                string cityUrl = urlHits.Count > 0 ? urlHits[0] : "";

                城市表 city = CityManager.Get城市_byLike城市名称(cityName);
                if (city == null)
                {
                    continue;
                }

                if (SpiderWebConfigManager.get网站爬取配置_by城市ID_网站ID(city.ID, WebsiteManager.搜房网_ID) != null)
                {
                    // A configuration already exists for this city and site.
                    continue;
                }

                string domain = cityUrl.Replace("http://", "http://esf.").TrimEnd('/');
                if (city.城市名称.Contains("北京"))
                {
                    // For Beijing the "bj." part is stripped from the derived URL.
                    domain = domain.Replace("bj.", "");
                }

                string listUrl = domain + "/house/h316-j3100-w32/";
                Dictionary<string, List<string>> listPage = SpiderHelp.GetHtmlByRegex(listUrl, "gbk", patterns, WebsiteManager.GetWebById(WebsiteManager.搜房网_ID), CityId);
                List<string> countHits = listPage["总条数"];
                string totalCount = countHits.Count > 0 ? countHits[0] : "";
                if (string.IsNullOrEmpty(totalCount))
                {
                    // The listing page showed no total count, so no statement is produced.
                    continue;
                }

                string statement = string.Format(
                    " exec 往网站爬取配置表添加配置信息 '{0}','{1}','{2}','{3}','{4}',{5},{6}",
                    WebsiteManager.搜房网, city.城市名称, domain, listUrl, "", "4000", "2000");
                statements.Append(statement).Append("\r\n");
                statementList.Add(statement);
            }

            // Nothing in this method reads these afterwards.
            string result = statements.ToString();
        }
コード例 #8
0
ファイル: NewDataSpider.cs プロジェクト: kingshhh/fxtcode
        /// <summary>
        /// Starts a generic <c>其他城市</c> spider for every Changzhou-real-estate-site
        /// configuration whose city has no entry with the same <c>CityName</c> in
        /// <c>NewDataRumList</c>.
        /// </summary>
        public void start()
        {
            List<VIEW_网站爬取配置_城市表_网站表> configs = SpiderWebConfigManager.获取常州房产网下所有城市爬取配置();

            foreach (VIEW_网站爬取配置_城市表_网站表 cityConfig in configs)
            {
                NewDataRum existing = NewDataRumList.Find(run => run.CityName.Equals(cityConfig.城市名称));
                if (existing != null)
                {
                    // An entry for this city is already in the list; skip it.
                    continue;
                }

                其他城市 spider = new 其他城市(cityConfig.城市名称);
                spider.start(cityConfig.域名, cityConfig.列表页链接, cityConfig.详细页面爬取频率, cityConfig.列表页面爬取频率);
            }
        }
コード例 #9
0
        /// <summary>
        /// Starts a generic <c>其他城市</c> spider for every 58.com configuration whose city
        /// has no entry with the same <c>CityName</c> in <c>NewDataRumList</c>.
        /// </summary>
        public void start()
        {
            // Disabled manual-debugging shortcuts, kept for reference:
            //GetHouseByUrl("http://lw.58.com/ershoufang/16131733324033x.shtml", "");
            //new 其他城市("深圳").start("http://sz.58.com", "http://sz.58.com/ershoufang/", 2000, 1000);
            //return;
            List<VIEW_网站爬取配置_城市表_网站表> configs = SpiderWebConfigManager.获取五八同城下所有城市爬取配置();

            foreach (VIEW_网站爬取配置_城市表_网站表 cityConfig in configs)
            {
                NewDataRum existing = NewDataRumList.Find(run => run.CityName.Equals(cityConfig.城市名称));
                if (existing != null)
                {
                    // An entry for this city is already in the list; skip it.
                    continue;
                }

                其他城市 spider = new 其他城市(cityConfig.城市名称);
                spider.start(cityConfig.域名, cityConfig.列表页链接, cityConfig.详细页面爬取频率, cityConfig.列表页面爬取频率);
            }
        }
コード例 #10
0
        /// <summary>
        /// Reads the 58.com "change city" page and builds an
        /// <c>exec 往网站爬取配置表添加配置信息 ...</c> statement for each listed city that is
        /// known locally and has no 58.com crawl configuration yet, then calls
        /// <c>导出任务计划配置文件</c>.
        /// </summary>
        /// <remarks>
        /// The statements are only collected into a local string; this method neither
        /// executes nor returns them.
        /// </remarks>
        public void start()
        {
            RegexInfo directoryPattern = new RegexInfo("<dl id=\"clist\">((?:(?!</dl>).)*)</dl>", "$1");
            RegexInfo anchorPattern    = new RegexInfo("(<a href=\"[^\"]+\" onclick=\"co\\([^\"]+\">[^<>]+</a>)", "$1");

            // The same dictionary is reused for the second extraction with one more entry.
            Dictionary<string, RegexInfo> patterns = new Dictionary<string, RegexInfo>
            {
                { "城市列表Text", directoryPattern }
            };

            Dictionary<string, List<string>> directoryPage = SpiderHelp.GetHtmlByRegex("http://www.58.com/ershoufang/changecity/", "utf-8", patterns, WebsiteManager.GetWebById(WebsiteManager.五八同城_ID), CityId);
            List<string> directoryHits = directoryPage["城市列表Text"];
            string directoryHtml = "";
            if (directoryHits.Count > 0)
            {
                directoryHtml = directoryHits[0];
            }

            patterns.Add("城市列表", anchorPattern);
            List<string> cityAnchors = SpiderHelp.GetStrByRegex(directoryHtml, patterns)["城市列表"];

            StringBuilder statements = new StringBuilder();

            foreach (string anchorHtml in cityAnchors)
            {
                // Split one <a> element into its link text (city name) and its href (city URL).
                RegexInfo namePattern = new RegexInfo("<a href=\"[^\"]+\" onclick=\"co\\([^\"]+\">([^<>]+)</a>", "$1");
                RegexInfo urlPattern  = new RegexInfo("<a href=\"([^\"]+)\" onclick=\"co\\([^\"]+\">[^<>]+</a>", "$1");
                Dictionary<string, RegexInfo> anchorPatterns = new Dictionary<string, RegexInfo>();
                anchorPatterns.Add("regexCityName", namePattern);
                anchorPatterns.Add("regexCityUrl", urlPattern);
                Dictionary<string, List<string>> anchorParts = SpiderHelp.GetStrByRegex(anchorHtml, anchorPatterns);

                List<string> nameHits = anchorParts["regexCityName"];
                string cityName = StringHelp.TrimBlank(nameHits.Count > 0 ? nameHits[0] : "");
                List<string> urlHits = anchorParts["regexCityUrl"];
                string cityUrl = urlHits.Count > 0 ? urlHits[0] : "";

                城市表 city = CityManager.Get城市_byLike城市名称(cityName);
                if (city == null)
                {
                    continue;
                }

                if (SpiderWebConfigManager.get网站爬取配置_by城市ID_网站ID(city.ID, WebsiteManager.五八同城_ID) != null)
                {
                    // A configuration already exists for this city and site.
                    continue;
                }

                string statement = string.Format(
                    " exec 往网站爬取配置表添加配置信息 '{0}','{1}','{2}','{3}','{4}',{5},{6}",
                    WebsiteManager.五八同城,
                    city.城市名称,
                    cityUrl.Replace("/ershoufang", ""),
                    cityUrl.TrimEnd('/') + "/",
                    "",
                    "2000",
                    "2000");
                statements.Append(statement).Append("\r\n");
            }

            // Nothing in this method reads this afterwards.
            string result = statements.ToString();

            导出任务计划配置文件();
        }
コード例 #11
0
        /// <summary>
        /// Reads the cityhouse.cn city index and builds an
        /// <c>exec 往网站爬取配置表添加配置信息 ...</c> statement for each listed city that is
        /// known locally, has not been handled earlier in this run, and has no crawl
        /// configuration for this site yet; then calls <c>导出任务计划配置文件</c>.
        /// </summary>
        /// <remarks>
        /// Statements are split into two buffers: one for cities whose for-sale list page
        /// yielded a total-page-count match, one for cities where it did not. Both buffers
        /// end up in local strings only; this method neither executes nor returns them.
        /// Repeated city links are filtered with an exact-match set of handled city names.
        /// An earlier version searched the accumulated statement text for the name, which
        /// re-built the text on every iteration and could skip a city whose name merely
        /// occurred as a substring of text already emitted.
        /// </remarks>
        public void start()
        {
            网站表       webObj = WebsiteManager.GetWebById(WebsiteManager.城市房产_ID);
            RegexInfo 总页数正则  = new RegexInfo("<div class=\"[^\"]*\"><span class='fl mr'>\\d+/(\\d+)</span>", "$1");
            Dictionary <string, RegexInfo> 根页面正则字典集合 = new Dictionary <string, RegexInfo>();

            根页面正则字典集合.Add("总页数", 总页数正则);
            RegexInfo cityRegexInfo = new RegexInfo("(<a[^<>]+href=[\",']{1,1}http\\://[^\\.]+.cityhouse.cn[\",']{1,1}[^<>]*>[^<>]+</a>)", "$1");
            Dictionary <string, RegexInfo> cityRegexDic = new Dictionary <string, RegexInfo>();

            cityRegexDic.Add("城市列表", cityRegexInfo);
            Dictionary <string, List <string> > dicCitylistText = SpiderHelp.GetHtmlByRegex("http://www.cityhouse.cn/city.html", "utf-8", cityRegexDic, WebsiteManager.GetWebById(WebsiteManager.城市房产_ID), CityId);
            List <string> cityList = dicCitylistText["城市列表"];
            StringBuilder citySb   = new StringBuilder();   // cities whose list page showed a page count
            StringBuilder citySb2  = new StringBuilder();   // cities whose list page showed none

            // Names of cities for which a statement was already produced in this run.
            HashSet <string> handledCities = new HashSet <string>();

            foreach (string cityInfoStr in cityList)
            {
                // Split one <a> element into its link text (city name) and its href (city URL).
                RegexInfo regexCityName = new RegexInfo("<a[^<>]+href=[\",']{1,1}http\\://[^\\.]+.cityhouse.cn[\",']{1,1}[^<>]*>([^<>]+)</a>", "$1");
                RegexInfo regexCityUrl  = new RegexInfo("<a[^<>]+href=[\",']{1,1}(http\\://[^\\.]+.cityhouse.cn)[\",']{1,1}[^<>]*>[^<>]+</a>", "$1");
                Dictionary <string, RegexInfo> cityRegexDic2 = new Dictionary <string, RegexInfo>();
                cityRegexDic2.Add("regexCityName", regexCityName);
                cityRegexDic2.Add("regexCityUrl", regexCityUrl);
                Dictionary <string, List <string> > dicCityInfo = SpiderHelp.GetStrByRegex(cityInfoStr, cityRegexDic2);
                string cityName = StringHelp.TrimBlank(dicCityInfo["regexCityName"].Count > 0 ? dicCityInfo["regexCityName"][0] : "");
                string cityUrl  = dicCityInfo["regexCityUrl"].Count > 0 ? dicCityInfo["regexCityUrl"][0] : "";

                城市表 city = CityManager.Get城市_byLike城市名称(cityName);
                if (city == null || handledCities.Contains(city.城市名称))
                {
                    continue;
                }

                if (SpiderWebConfigManager.get网站爬取配置_by城市ID_网站ID(city.ID, WebsiteManager.城市房产_ID) != null)
                {
                    // A configuration already exists for this city and site.
                    continue;
                }

                string listUrl = cityUrl + "/forsale/flist.html?ob=10";
                Dictionary <string, List <string> > 根页面正则字典集合结果 = SpiderHelp.GetHtmlByRegex(listUrl, "utf-8", 根页面正则字典集合, webObj, CityId, referer: listUrl);

                string execStr = string.Format(" exec 往网站爬取配置表添加配置信息 '{0}','{1}','{2}','{3}','{4}',{5},{6}",
                                               WebsiteManager.城市房产, city.城市名称, cityUrl, listUrl,
                                               "", "2000", "2000");
                handledCities.Add(city.城市名称);

                if (根页面正则字典集合结果["总页数"].Count < 1)
                {
                    citySb2.Append(execStr).Append("\r\n");
                    continue;
                }
                citySb.Append(execStr).Append("\r\n");
            }

            // Nothing in this method reads these afterwards.
            string result  = citySb.ToString();
            string result2 = citySb2.ToString();

            导出任务计划配置文件();
        }
コード例 #12
0
ファイル: 北京.cs プロジェクト: kingshhh/fxtcode
 /// <summary>
 /// Prepares this spider for Beijing and launches its crawl run.
 /// </summary>
 /// <remarks>
 /// Sets the page encoding to GBK, appends one XML-defined rule to each field
 /// extractor, then looks up the Sina second-hand-housing crawl configuration for
 /// Beijing. Nothing is started when no configuration is returned.
 /// NOTE(review): this method used to carry commented-out hard-coded patterns and a
 /// commented-out XML-generation snippet for the same extractors; they look
 /// superseded by the XML lookups below - recover them from history if needed.
 /// </remarks>
 public void start()
 {
     // Value passed as the cityName argument of every GetRegexInfoByXmlName lookup.
     const string lookupKey = "NewDataSpider2";
     const string city = "北京";

     NowPageEncoding = "gbk";

     regex_lpm.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_lpm", 网站名称, cityName: lookupKey));
     regex_xzq.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_xzq", 网站名称, cityName: lookupKey));
     regex_pq.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_pq", 网站名称, cityName: lookupKey));
     regex_mj.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_mj", 网站名称, cityName: lookupKey));
     regex_dj.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_dj", 网站名称, cityName: lookupKey));
     regex_zj.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_zj", 网站名称, cityName: lookupKey));
     regex_szlc.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_szlc", 网站名称, cityName: lookupKey));
     regex_zlc.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_zlc", 网站名称, cityName: lookupKey));
     regex_hx.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_hx", 网站名称, cityName: lookupKey));
     regex_cx.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_cx", 网站名称, cityName: lookupKey));
     regex_zx.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_zx", 网站名称, cityName: lookupKey));
     regex_jznd.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_jznd", 网站名称, cityName: lookupKey));
     regex_title.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_title", 网站名称, cityName: lookupKey));
     regex_phone.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_phone", 网站名称, cityName: lookupKey));
     regex_infUrl.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_infUrl", 网站名称, cityName: lookupKey));
     regex_address.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_address", 网站名称, cityName: lookupKey));
     regex_nextPage.RegexInfoList.Add(SpiderRegexInfoHelp.GetRegexInfoByXmlName("regex_nextPage", 网站名称, cityName: lookupKey));

     网站爬取配置 config = SpiderWebConfigManager.根据城市获取新浪二手房爬取配置(city);
     if (config == null)
     {
         // No configuration for this city: there is nothing to crawl.
         return;
     }

     NewDataRum run = new NewDataRum(city, config.域名, config.列表页链接, config.详细页面爬取频率, config.列表页面爬取频率);
     run.start(this);
 }
コード例 #13
0
        /// <summary>
        /// Reads the fccs.com city index and builds an
        /// <c>exec 往网站爬取配置表添加配置信息 ...</c> statement for each whitelisted city
        /// that is known locally, has not been handled earlier in this run, and has no
        /// crawl configuration for this site yet; then calls <c>导出任务计划配置文件</c>.
        /// </summary>
        /// <remarks>
        /// Statements are split into two buffers: one for cities whose second-hand list
        /// page yielded a total-page-count match, one for cities where it did not. Both
        /// buffers end up in local strings only; this method neither executes nor returns
        /// them. Repeated city links are filtered with an exact-match set of handled city
        /// names rather than a substring search over the accumulated statement text,
        /// which could skip a city whose name merely occurred inside text already emitted.
        /// </remarks>
        public void start()
        {
            // Only cities named here are considered.
            string[] citys = new string[] { "济宁", "杭州", "漳州", "威海", "宜昌", "北海", "包头", "滨州", "长春", "大连", "东营",
                                            "衡水", "湖州", "金华", "九江", "盐城", "嘉兴", "聊城", "芜湖", "临沂", "南通", "秦皇岛",
                                            "衢州", "日照", "上海", "石家庄", "绍兴", "泰安", "潍坊", "襄阳", "银川", "烟台", "淄博", "镇江" };
            网站表       webObj = WebsiteManager.GetWebById(WebsiteManager.中国房产超市_ID);
            RegexInfo 总页数正则  = new RegexInfo(">下一页</a><div[^<>]*><span[^<>]*>[^<>]*</span>/([^<>]+)<", "$1");
            Dictionary <string, RegexInfo> 根页面正则字典集合 = new Dictionary <string, RegexInfo>();

            根页面正则字典集合.Add("总页数", 总页数正则);
            RegexInfo cityRegexListText = new RegexInfo("<span class=\"hui6c\"[^<>]*>((?:(?!</span).)*)</span>", "$1");
            RegexInfo cityRegexInfo     = new RegexInfo("(<a [^<>]+>[^<>]+</a>)", "$1");
            Dictionary <string, RegexInfo> cityRegexDic = new Dictionary <string, RegexInfo>();

            cityRegexDic.Add("城市列表文本", cityRegexListText);
            // Fetch the block of the index page that holds all city links.
            Dictionary <string, List <string> > dicCitylistText = SpiderHelp.GetHtmlByRegex("http://www.fccs.com/city/", "gb2312", cityRegexDic, WebsiteManager.GetWebById(WebsiteManager.中国房产超市_ID), CityId);
            string cityText = dicCitylistText["城市列表文本"].Count < 1 ? "" : dicCitylistText["城市列表文本"][0];

            // Extract the individual city anchors from that block.
            cityRegexDic.Add("城市列表", cityRegexInfo);
            dicCitylistText = SpiderHelp.GetStrByRegex(cityText, cityRegexDic);
            List <string> cityList = dicCitylistText["城市列表"];
            StringBuilder citySb   = new StringBuilder();   // cities whose list page showed a page count
            StringBuilder citySb2  = new StringBuilder();   // cities whose list page showed none

            // Names of cities for which a statement was already produced in this run.
            HashSet <string> handledCities = new HashSet <string>();

            foreach (string cityInfoStr in cityList)
            {
                // Split one <a> element into its link text (city name) and its href (city URL).
                RegexInfo regexCityName = new RegexInfo("<a [^<>]+>([^<>]+)</a>", "$1");
                RegexInfo regexCityUrl  = new RegexInfo("<a[^<>]+href=\"([^<>]+)\"[^<>]*>[^<>]+</a>", "$1");
                Dictionary <string, RegexInfo> cityRegexDic2 = new Dictionary <string, RegexInfo>();
                cityRegexDic2.Add("regexCityName", regexCityName);
                cityRegexDic2.Add("regexCityUrl", regexCityUrl);
                Dictionary <string, List <string> > dicCityInfo = SpiderHelp.GetStrByRegex(cityInfoStr, cityRegexDic2);
                string cityName = StringHelp.TrimBlank(dicCityInfo["regexCityName"].Count > 0 ? dicCityInfo["regexCityName"][0] : "");
                string cityUrl  = dicCityInfo["regexCityUrl"].Count > 0 ? dicCityInfo["regexCityUrl"][0] : "";

                // Skip cities that are not on the whitelist.
                if (!citys.Contains(cityName))
                {
                    continue;
                }

                城市表 city = CityManager.Get城市_byLike城市名称(cityName);
                if (city == null || handledCities.Contains(city.城市名称))
                {
                    continue;
                }

                if (SpiderWebConfigManager.get网站爬取配置_by城市ID_网站ID(city.ID, WebsiteManager.中国房产超市_ID) != null)
                {
                    // A configuration already exists for this city and site.
                    continue;
                }

                // The same URL serves as both the domain and the list-page link in the statement.
                string host = cityUrl.Replace("http://", "http://second.");
                Dictionary <string, List <string> > 根页面正则字典集合结果 = SpiderHelp.GetHtmlByRegex(host, "gb2312", 根页面正则字典集合, webObj, CityId);

                // Build the statement text.
                string execStr = string.Format(" exec 往网站爬取配置表添加配置信息 '{0}','{1}','{2}','{3}','{4}',{5},{6}",
                                               WebsiteManager.中国房产超市, city.城市名称, host, host,
                                               "", "1", "1");
                handledCities.Add(city.城市名称);

                if (根页面正则字典集合结果["总页数"].Count < 1)
                {
                    citySb2.Append(execStr).Append("\r\n");
                    continue;
                }
                citySb.Append(execStr).Append("\r\n");
            }

            // Nothing in this method reads these afterwards.
            string        result  = citySb.ToString();
            string        result2 = citySb2.ToString();
            // Whitelisted names that do not occur anywhere in the page-count buffer.
            List <string> test    = citys.Where(obj => !result.Contains(obj)).ToList();

            导出任务计划配置文件();
        }