示例#1
0
        /// <summary>
        /// Configures the website combo box so that it displays the 网站名称 member
        /// and uses the ID member as its value.
        /// </summary>
        /// <remarks>
        /// A list containing a single "全部" ("All") placeholder entry with ID -1 is built,
        /// but the data-source assignment is commented out, so the list is not bound here.
        /// </remarks>
        private void 绑定所有网站()
        {
            List <网站表> sites = new List <网站表>(); // WebsiteManager.所有网站;

            // Placeholder entry that goes in front of the real websites.
            网站表 allSitesEntry = new 网站表();
            allSitesEntry.ID   = -1;
            allSitesEntry.网站名称 = "全部";
            sites.Insert(0, allSitesEntry);

            cb网站.DisplayMember = "网站名称";
            cb网站.ValueMember   = "ID";
            //cb网站.DataSource = sites;
        }
示例#2
0
        /// <summary>
        /// Looks up the ID of the website whose name equals <paramref name="webName"/>.
        /// </summary>
        /// <param name="webName">Website name to search for in the 所有网站 list.</param>
        /// <returns>
        /// The ID of the first website whose 网站名称 equals <paramref name="webName"/>,
        /// or 0 when the name is null/empty or no website matches.
        /// </returns>
        public static int GetWebIdByWebName(string webName)
        {
            // A null or empty name can never match, so skip the scan entirely.
            if (string.IsNullOrEmpty(webName))
            {
                return(0);
            }

            // The lookup only reads the in-memory list, so no database context is created.
            // webName is non-null here; string.Equals tolerates a null 网站名称.
            网站表 website = 所有网站.Find(delegate(网站表 _website) { return(webName.Equals(_website.网站名称)); });

            return(website != null ? website.ID : 0);
        }
示例#3
0
        /// <summary>
        /// Reloads the website combo box whenever the selected city changes.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void cboxCity_SelectedIndexChanged(object sender, EventArgs e)
        {
            string cityIdText = Convert.ToString(cboxCity.SelectedValue);

            // Nothing to load unless the selected value is a non-empty string accepted by StringHelp.IsInteger.
            if (string.IsNullOrEmpty(cityIdText) || !StringHelp.IsInteger(cityIdText))
            {
                return;
            }

            int cityId = Convert.ToInt32(cityIdText);
            List <网站表> sites = WebsiteManager.GetWebByCityId(cityId);

            // Put the "全部" ("All") placeholder (ID -1) in front of the city's websites.
            网站表 allSitesEntry = new 网站表();
            allSitesEntry.ID   = -1;
            allSitesEntry.网站名称 = "全部";
            sites.Insert(0, allSitesEntry);

            cb网站.DataSource = sites;
        }
        /// <summary>
        /// Reads the city index page of cityhouse.cn and, for every listed city that is known to
        /// CityManager and has no crawl configuration for this website yet, builds an
        /// "exec 往网站爬取配置表添加配置信息" statement.
        /// </summary>
        /// <remarks>
        /// Statements are collected in two buffers: one for cities whose listing page yielded a
        /// total page count and one for cities where it did not. Neither buffer is consumed by this
        /// method after being converted to a string. The method ends by calling 导出任务计划配置文件.
        /// </remarks>
        public void start()
        {
            网站表 webObj = WebsiteManager.GetWebById(WebsiteManager.城市房产_ID);

            // Rule applied to each city's listing page to extract the total page count.
            RegexInfo totalPagesRegex = new RegexInfo("<div class=\"[^\"]*\"><span class='fl mr'>\\d+/(\\d+)</span>", "$1");
            Dictionary <string, RegexInfo> rootPageRegexes = new Dictionary <string, RegexInfo> {
                { "总页数", totalPagesRegex }
            };

            // Rule applied to the city index page to capture each city's anchor element.
            RegexInfo cityAnchorRegex = new RegexInfo("(<a[^<>]+href=[\",']{1,1}http\\://[^\\.]+.cityhouse.cn[\",']{1,1}[^<>]*>[^<>]+</a>)", "$1");
            Dictionary <string, RegexInfo> cityListRegexes = new Dictionary <string, RegexInfo> {
                { "城市列表", cityAnchorRegex }
            };

            Dictionary <string, List <string> > cityListResult = SpiderHelp.GetHtmlByRegex("http://www.cityhouse.cn/city.html", "utf-8", cityListRegexes, WebsiteManager.GetWebById(WebsiteManager.城市房产_ID), CityId);
            List <string> cityAnchors    = cityListResult["城市列表"];
            StringBuilder pagedCitySql   = new StringBuilder();
            StringBuilder unpagedCitySql = new StringBuilder();

            foreach (string anchorHtml in cityAnchors)
            {
                // Split the anchor element into the city name and the city's base URL.
                RegexInfo nameRegex = new RegexInfo("<a[^<>]+href=[\",']{1,1}http\\://[^\\.]+.cityhouse.cn[\",']{1,1}[^<>]*>([^<>]+)</a>", "$1");
                RegexInfo urlRegex  = new RegexInfo("<a[^<>]+href=[\",']{1,1}(http\\://[^\\.]+.cityhouse.cn)[\",']{1,1}[^<>]*>[^<>]+</a>", "$1");
                Dictionary <string, RegexInfo> anchorRegexes = new Dictionary <string, RegexInfo> {
                    { "regexCityName", nameRegex },
                    { "regexCityUrl", urlRegex }
                };
                Dictionary <string, List <string> > anchorParts = SpiderHelp.GetStrByRegex(anchorHtml, anchorRegexes);

                List <string> nameMatches = anchorParts["regexCityName"];
                string        cityName    = StringHelp.TrimBlank(nameMatches.Count > 0 ? nameMatches[0] : "");
                List <string> urlMatches  = anchorParts["regexCityUrl"];
                string        cityUrl     = urlMatches.Count > 0 ? urlMatches[0] : "";

                城市表 city = CityManager.Get城市_byLike城市名称(cityName);

                // Skip unknown cities, cities whose name already occurs in either buffer,
                // and cities that already have a crawl configuration for this website.
                if (city == null
                    || unpagedCitySql.ToString().Contains(city.城市名称)
                    || pagedCitySql.ToString().Contains(city.城市名称)
                    || SpiderWebConfigManager.get网站爬取配置_by城市ID_网站ID(city.ID, WebsiteManager.城市房产_ID) != null)
                {
                    continue;
                }

                // The listing page is requested with itself as the referer.
                string listUrl = cityUrl + "/forsale/flist.html?ob=10";
                Dictionary <string, List <string> > rootPageResult = SpiderHelp.GetHtmlByRegex(listUrl, "utf-8", rootPageRegexes, webObj, CityId, referer: listUrl);

                string execStr = string.Format(" exec 往网站爬取配置表添加配置信息 '{0}','{1}','{2}','{3}','{4}',{5},{6}",
                                               WebsiteManager.城市房产, city.城市名称, cityUrl, listUrl,
                                               "", "2000", "2000");

                // Cities without a detectable total page count are kept in the separate buffer.
                StringBuilder target = rootPageResult["总页数"].Count() < 1 ? unpagedCitySql : pagedCitySql;
                target.Append(execStr).Append("\r\n");
            }

            // Both strings are produced but not used further inside this method.
            string result  = pagedCitySql.ToString();
            string result2 = unpagedCitySql.ToString();

            导出任务计划配置文件();
        }
示例#5
0
        /// <summary>
        /// Loads the website list via GetAllWebsite and resolves the ID of every known website by name.
        /// </summary>
        /// <remarks>
        /// An ID is 0 when no website with the corresponding name exists in the list.
        /// 赶集网_ID and 五八同城_ID are the exception: they keep whatever value they already hold
        /// when no match is found. The lookups are null-safe so that a missing list or a row
        /// without a name cannot make this static constructor throw, which would leave the type
        /// unusable.
        /// </remarks>
        static WebsiteManager()
        {
            所有网站      = GetAllWebsite();

            搜房网_ID    = ResolveWebsiteId(搜房网, 0);
            安居客_ID    = ResolveWebsiteId(安居客, 0);
            新浪二手房_ID  = ResolveWebsiteId(新浪二手房, 0);
            // These two fall back to their current value instead of 0.
            赶集网_ID    = ResolveWebsiteId(赶集网, 赶集网_ID);
            五八同城_ID   = ResolveWebsiteId(五八同城, 五八同城_ID);
            黄石信息港_ID  = ResolveWebsiteId(黄石信息港, 0);
            住在九江_ID   = ResolveWebsiteId(住在九江, 0);
            城市房产_ID   = ResolveWebsiteId(城市房产, 0);
            河源置业网_ID  = ResolveWebsiteId(河源置业网, 0);
            邯郸恋家网_ID  = ResolveWebsiteId(邯郸恋家网, 0);
            常州房产网_ID  = ResolveWebsiteId(常州房产网, 0);
            楼盘网_ID    = ResolveWebsiteId(楼盘网, 0);
            搜狐二手房_ID  = ResolveWebsiteId(搜狐二手房, 0);
            满堂红地产网_ID = ResolveWebsiteId(满堂红地产网, 0);
            置家网_ID    = ResolveWebsiteId(置家网, 0);
            中国房产超市_ID = ResolveWebsiteId(中国房产超市, 0);
            中原地产_ID   = ResolveWebsiteId(中原地产, 0);
        }

        /// <summary>
        /// Returns the ID of the first website in 所有网站 whose 网站名称 equals <paramref name="name"/>.
        /// </summary>
        /// <param name="name">Website name to look for.</param>
        /// <param name="fallback">Value returned when the list or the name is null, or nothing matches.</param>
        /// <returns>The matching website's ID, or <paramref name="fallback"/>.</returns>
        private static int ResolveWebsiteId(string name, int fallback)
        {
            if (所有网站 == null || name == null)
            {
                return(fallback);
            }

            // name is non-null here; string.Equals tolerates a null 网站名称, and null rows are skipped.
            网站表 match = 所有网站.Find(delegate(网站表 site) { return(site != null && name.Equals(site.网站名称)); });

            return(match != null ? match.ID : fallback);
        }
示例#6
0
        /// <summary>
        /// Finds the website with the given ID in the 所有网站 list.
        /// </summary>
        /// <param name="id">Website ID to look for.</param>
        /// <returns>The first website whose ID equals <paramref name="id"/>, or null when none matches.</returns>
        public static 网站表 GetWebById(int id)
        {
            return(所有网站.Find(candidate => candidate.ID == id));
        }
示例#7
0
        /// <summary>
        /// Downloads a page and extracts strings from it with the supplied regex rules,
        /// retrying on request failures and switching proxy IPs when the site blocks the crawler.
        /// </summary>
        /// <remarks>
        /// Flow, as implemented with the labels below:
        /// a failed request is retried up to 2 times (3 s pause); after that, if
        /// webObj.BlockadeOfIP is set, the proxy IP is replaced up to 3 times; otherwise the failure
        /// is logged and recorded (Code_1_1) and processing continues with whatever HTML was last
        /// fetched successfully (initially an empty string).
        /// If webObj.BlockadeOfIPType equals BlockadeOfIPType1 and regex_checkcode matches the page,
        /// the proxy IP is replaced up to 3 times; after that the captcha failure is recorded (Code_1_3).
        /// Rules whose key contains "*" are treated as required: an empty match is recorded (Code_1_2).
        /// NOTE(review): webObj is dereferenced without a null check in the catch block and in the
        /// captcha check, although later code tests it for null; confirm callers never pass null.
        /// </remarks>
        /// <param name="url">Page URL.</param>
        /// <param name="encoding">Page encoding.</param>
        /// <param name="dic">Regex rules keyed by a descriptive name; when null an empty result is returned.</param>
        /// <param name="webObj">Website record; its ID, BlockadeOfIP and BlockadeOfIPType are read.</param>
        /// <param name="cityId">City ID written to the crawl error log.</param>
        /// <param name="referer">Forwarded to GetHtml.</param>
        /// <param name="keepAlive">Forwarded to GetHtml.</param>
        /// <param name="timeout">Forwarded to GetHtml.</param>
        /// <returns>
        /// A dictionary with one entry per rule key holding the list returned by GetStrByRegexByIndex,
        /// plus a "NotData" entry when the page is the 搜房网 "listing sold" page.
        /// </returns>
        public static Dictionary <string, List <string> > GetHtmlByRegex(string url, string encoding, Dictionary <string, RegexInfo> dic, 网站表 webObj, int cityId, string referer = null, bool keepAlive = false, int timeout = 60000)
        {
            string NowProxyIp     = null;
            int    网络异常重试次数       = 0;
            int    网络异常时代理ip更换次数  = 0;
            int    验证码异常时代理ip更换次数 = 0;
            string resultHtml     = "";
            // NowProxyIp is null at this point, so ipStr always starts out empty;
            // it accumulates the proxy IPs used, for the error log.
            string ipStr          = NowProxyIp == null ? "" : NowProxyIp + ",";

            // Re-entry point for every retry; the result dictionary is recreated each time.
begin:
            Dictionary <string, List <string> > resultDic = new Dictionary <string, List <string> >();

            if (dic == null)
            {
                return(resultDic);
            }
            try
            {
                resultHtml = GetHtml(url, encoding, proxyIp: NowProxyIp, referer: referer, keepAlive: keepAlive, timeout: timeout);
            }
            catch (Exception ex)
            {
                // Plain retry with the same proxy; this counter is never reset within the call.
                if (网络异常重试次数 < 2)
                {
                    System.Threading.Thread.Sleep(3000);
                    网络异常重试次数++;
                    goto begin;
                }
                if (webObj.BlockadeOfIP)
                {
                    if (网络异常时代理ip更换次数 < 3)
                    {
                        System.Threading.Thread.Sleep(2000);
                        网络异常时代理ip更换次数++;

work:
                        if (!WorkItemManager.CheckPassSpider())// check whether a maintenance job is running on the database
                        {
                            // Poll once a minute until CheckPassSpider returns true.
                            System.Threading.Thread.Sleep(60000);
                            goto work;
                        }
                        ProxyIpManager.SetNotEffectiveProxyIp(webObj.ID, NowProxyIp); // mark the current proxy IP as unusable
                        NowProxyIp = ProxyIpManager.GetEffectiveProxyIp(webObj.ID);   // get a new proxy IP
                        ipStr      = ipStr + NowProxyIp + ",";
                        goto begin;
                    }
                }
                log.Error(string.Format("SpiderHouse:(requestUrl:{0}--请求异常)", url), ex);
                // Record the crawl failure reason and details (network error)
work2:
                if (!WorkItemManager.CheckPassSpider())// check whether a maintenance job is running on the database
                {
                    System.Threading.Thread.Sleep(60000);
                    goto work2;
                }
                DatSpiderErrorLogManager.InsertError(cityId, webObj.ID, url, SysCodeManager.Code_1_1, "所用ip:" + ipStr);
                // Continue with whatever resultHtml currently holds.
                goto end;
            }
end:
            // Strip line breaks (with following whitespace) and tabs before applying the rules.
            resultHtml = Regex.Replace(resultHtml, @"(([\r\n])[\s]+|[\r\n]+|[\t]+)", "", RegexOptions.IgnoreCase);
            // Check whether a captcha is required
            bool checkError = false;// records whether this is a captcha error

            if (webObj.BlockadeOfIPType == WebsiteManager.BlockadeOfIPType1)
            {
                List <string> checkList = GetStrByRegexByIndex(resultHtml, regex_checkcode);
                if (checkList != null && checkList.Count > 0)
                {
                    // A page was received, so the network-failure proxy counter starts over.
                    网络异常时代理ip更换次数 = 0;
                    if (验证码异常时代理ip更换次数 < 3)
                    {
                        验证码异常时代理ip更换次数++;
                        // Record the crawl failure reason and details (network error)
work3:
                        if (!WorkItemManager.CheckPassSpider())// check whether a maintenance job is running on the database
                        {
                            System.Threading.Thread.Sleep(60000);
                            goto work3;
                        }
                        ProxyIpManager.SetNotEffectiveProxyIp(webObj.ID, NowProxyIp); // mark the current proxy IP as unusable
                        NowProxyIp = ProxyIpManager.GetEffectiveProxyIp(webObj.ID);   // get a new proxy IP
                        ipStr      = ipStr + NowProxyIp + ",";
                        goto begin;
                    }
                    else
                    {
                        checkError = true;
                        // Record that a captcha was required while crawling
                        // Record the crawl failure reason and details (network error)
work4:
                        if (!WorkItemManager.CheckPassSpider())// check whether a maintenance job is running on the database
                        {
                            System.Threading.Thread.Sleep(60000);
                            goto work4;
                        }
                        DatSpiderErrorLogManager.InsertError(cityId, webObj.ID, url, SysCodeManager.Code_1_3,
                                                             string.Format("所用ip:{0}", ipStr)
                                                             );
                    }
                    // Only reached from the else branch above; the if branch leaves via goto begin.
                    log.Debug(string.Format("SpiderHouse:(requestUrl:{0}--请求异常:需输入验证码)", url));
                }
            }
            foreach (KeyValuePair <string, RegexInfo> kvp in dic)
            {
                string        key  = kvp.Key;
                List <string> list = GetStrByRegexByIndex(resultHtml, kvp.Value);
                resultDic.Add(key, list);
                // On the 搜房网 "listing sold" page, missing fields are not recorded as errors.
                if (webObj != null && webObj.ID == WebsiteManager.搜房网_ID && resultHtml.Contains(">此房源已售出!<") && resultHtml.Contains(">您可以选择查看:<"))
                {
                    continue;
                }
                // Rule is for an important field && no network error && no captcha error && the rule is not null
                if (key.Contains("*") && !string.IsNullOrEmpty(resultHtml) && !checkError && kvp.Value != null && !string.IsNullOrEmpty(kvp.Value.RegexStr))
                {
                    // Nothing was extracted by the rule && it is not a captcha error
                    if (list == null || list.Count < 1)
                    {
                        // Record the crawl failure reason and details (the rule extracted no text)
                        DatSpiderErrorLogManager.InsertError(cityId, webObj.ID, url, SysCodeManager.Code_1_2,
                                                             string.Format("描述:{0},规则:{1},索引:{2},其他规则个数:{3},所用ip:{4}",
                                                                           key,
                                                                           kvp.Value.RegexStr,
                                                                           kvp.Value.RegexIndex,
                                                                           kvp.Value.RegexInfoList == null ? 0 : kvp.Value.RegexInfoList.Count,
                                                                           ipStr
                                                                           )
                                                             );
                    }
                }
            }
            // Flag the 搜房网 "listing sold" page for callers with a "NotData" entry.
            if (webObj != null && webObj.ID == WebsiteManager.搜房网_ID && resultHtml.Contains(">此房源已售出!<") && resultHtml.Contains(">您可以选择查看:<"))
            {
                resultDic.Add("NotData", new List <string> {
                    "1"
                });
            }
            return(resultDic);
        }
        /// <summary>
        /// Reads the city index page of fccs.com and, for every city in the fixed target list that
        /// is known to CityManager and has no crawl configuration for this website yet, builds an
        /// "exec 往网站爬取配置表添加配置信息" statement.
        /// </summary>
        /// <remarks>
        /// Statements are collected in two buffers: one for cities whose second-hand listing page
        /// yielded a total page count and one for cities where it did not. The resulting strings
        /// and the list of target cities missing from the first buffer are computed but not used
        /// further inside this method. The method ends by calling 导出任务计划配置文件.
        /// </remarks>
        public void start()
        {
            // Only cities named here are considered.
            string[] targetCities = new string[] { "济宁", "杭州", "漳州", "威海", "宜昌", "北海", "包头", "滨州", "长春", "大连", "东营",
                                                   "衡水", "湖州", "金华", "九江", "盐城", "嘉兴", "聊城", "芜湖", "临沂", "南通", "秦皇岛",
                                                   "衢州", "日照", "上海", "石家庄", "绍兴", "泰安", "潍坊", "襄阳", "银川", "烟台", "淄博", "镇江" };
            网站表 webObj = WebsiteManager.GetWebById(WebsiteManager.中国房产超市_ID);

            // Rule applied to each city's listing page to extract the total page count.
            RegexInfo totalPagesRegex = new RegexInfo(">下一页</a><div[^<>]*><span[^<>]*>[^<>]*</span>/([^<>]+)<", "$1");
            Dictionary <string, RegexInfo> rootPageRegexes = new Dictionary <string, RegexInfo> {
                { "总页数", totalPagesRegex }
            };

            RegexInfo cityListTextRegex = new RegexInfo("<span class=\"hui6c\"[^<>]*>((?:(?!</span).)*)</span>", "$1");
            RegexInfo cityAnchorRegex   = new RegexInfo("(<a [^<>]+>[^<>]+</a>)", "$1");
            Dictionary <string, RegexInfo> cityRegexes = new Dictionary <string, RegexInfo> {
                { "城市列表文本", cityListTextRegex }
            };

            // Step 1: fetch the text region that contains all cities.
            Dictionary <string, List <string> > cityListResult = SpiderHelp.GetHtmlByRegex("http://www.fccs.com/city/", "gb2312", cityRegexes, WebsiteManager.GetWebById(WebsiteManager.中国房产超市_ID), CityId);
            List <string> regionMatches = cityListResult["城市列表文本"];
            string        cityText      = regionMatches.Count < 1 ? "" : regionMatches[0];

            // Step 2: extract the individual city anchors from that region.
            cityRegexes.Add("城市列表", cityAnchorRegex);
            cityListResult = SpiderHelp.GetStrByRegex(cityText, cityRegexes);
            List <string> cityAnchors    = cityListResult["城市列表"];
            StringBuilder pagedCitySql   = new StringBuilder();
            StringBuilder unpagedCitySql = new StringBuilder();
            StringBuilder citySb3        = new StringBuilder();

            foreach (string anchorHtml in cityAnchors)
            {
                // Split the anchor element into the city name and the city's URL.
                RegexInfo nameRegex = new RegexInfo("<a [^<>]+>([^<>]+)</a>", "$1");
                RegexInfo urlRegex  = new RegexInfo("<a[^<>]+href=\"([^<>]+)\"[^<>]*>[^<>]+</a>", "$1");
                Dictionary <string, RegexInfo> anchorRegexes = new Dictionary <string, RegexInfo> {
                    { "regexCityName", nameRegex },
                    { "regexCityUrl", urlRegex }
                };
                Dictionary <string, List <string> > anchorParts = SpiderHelp.GetStrByRegex(anchorHtml, anchorRegexes);

                List <string> nameMatches = anchorParts["regexCityName"];
                string        cityName    = StringHelp.TrimBlank(nameMatches.Count > 0 ? nameMatches[0] : "");
                List <string> urlMatches  = anchorParts["regexCityUrl"];
                string        cityUrl     = urlMatches.Count > 0 ? urlMatches[0] : "";

                // Ignore cities that are not in the target list.
                if (!targetCities.Any(target => target.Equals(cityName)))
                {
                    continue;
                }

                城市表 city = CityManager.Get城市_byLike城市名称(cityName);

                // Skip unknown cities, cities whose name already occurs in either buffer,
                // and cities that already have a crawl configuration for this website.
                if (city == null
                    || unpagedCitySql.ToString().Contains(city.城市名称)
                    || pagedCitySql.ToString().Contains(city.城市名称)
                    || SpiderWebConfigManager.get网站爬取配置_by城市ID_网站ID(city.ID, WebsiteManager.中国房产超市_ID) != null)
                {
                    continue;
                }

                // The "second." host is used both as the site URL and as the listing URL.
                string secondHandUrl = cityUrl.Replace("http://", "http://second.");
                Dictionary <string, List <string> > rootPageResult = SpiderHelp.GetHtmlByRegex(secondHandUrl, "gb2312", rootPageRegexes, webObj, CityId);

                // Build the SQL statement for this city.
                string execStr = string.Format(" exec 往网站爬取配置表添加配置信息 '{0}','{1}','{2}','{3}','{4}',{5},{6}",
                                               WebsiteManager.中国房产超市, city.城市名称, secondHandUrl, secondHandUrl,
                                               "", "1", "1");

                // Cities without a detectable total page count are kept in the separate buffer.
                StringBuilder targetBuffer = rootPageResult["总页数"].Count() < 1 ? unpagedCitySql : pagedCitySql;
                targetBuffer.Append(execStr).Append("\r\n");
            }

            // None of the following values are used further inside this method.
            string        result  = pagedCitySql.ToString();
            string        result2 = unpagedCitySql.ToString();
            string        result3 = citySb3.ToString();
            List <string> test    = targetCities.Where(name => !result.Contains(name)).ToList();

            导出任务计划配置文件();
        }