/// <summary>
/// Creates the concrete crawler object identified by a type name.
/// </summary>
/// <param name="Name">Type name resolved through <see cref="Type.GetType(string, bool)"/>.</param>
/// <param name="_Settings">Crawl settings, forwarded as the first constructor argument.</param>
/// <param name="_filter">Bloom filter, forwarded as the second constructor argument.</param>
/// <param name="_dataop">Data operation object, forwarded as the third constructor argument.</param>
/// <returns>
/// The newly created instance cast to <see cref="ISimpleCrawler"/>, or <c>null</c> when
/// resolving the type raises a <see cref="TypeLoadException"/>.
/// </returns>
public ISimpleCrawler Create(string Name, CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
{
    try
    {
        // throwOnError = true: an unknown type name raises TypeLoadException instead of yielding null.
        Type type = Type.GetType(Name, true);
        return (ISimpleCrawler)Activator.CreateInstance(type, _Settings, _filter, _dataop);
    }
    catch (TypeLoadException e)
    {
        // Keep the "null on load failure" contract for existing callers, but do not
        // swallow the failure silently: report which type could not be loaded and why.
        Console.Error.WriteLine("无法加载爬取类型 \"{0}\": {1}", Name, e.Message);
        return null;
    }
}
/// <summary>
/// Program entry point: resolves the crawler class name, creates the crawler through
/// <c>SimpleCrawlerFactory</c>, applies the crawl settings and starts the crawl.
/// </summary>
/// <param name="args">
/// Command-line arguments. When present, the first argument is used as the crawler class
/// name (without the <c>SimpleCrawler.Demo.</c> prefix, which is added here).
/// </param>
private static void Main(string[] args)
{
    // First command-line argument, when supplied, selects the crawler class.
    if (args.Length > 0)
    {
        crawlerClassName = args[0];
    }

    // No class name yet: ask on the console; an empty answer ends the program.
    if (string.IsNullOrEmpty(crawlerClassName))
    {
        Console.WriteLine("请-classname 设置对应的爬取类");
        crawlerClassName = Console.ReadLine();
        if (string.IsNullOrEmpty(crawlerClassName))
        {
            return;
        }
    }

    var factoryClassName = string.Format("SimpleCrawler.Demo.{0}", crawlerClassName);
    filter = new BloomFilter<string>(5000000);

    // Crawler class names listed in the original notes:
    //   LandFangUserUpdateCrawler       - updates masked ("*") data via a simulated login
    //   LandFangCrawler
    //   LandFangCityRegionCrawler       - fetches the guidCode mapping for cities/districts/counties
    //   LandFangCityRegionUpdateCrawler - updates transaction status and district
    //   QiXinEnterpriseCrawler          - QiXin crawl mapping enterprises to guids
    Console.WriteLine(connStr);
    Console.WriteLine(crawlerClassName);
    Console.WriteLine("确认数据库连接后继续进行");

    simpleCrawler = SimpleCrawlerFactory.Instance.Create(factoryClassName, Settings, filter, dataop);
    if (simpleCrawler == null)
    {
        // Create returns null when the type cannot be loaded; without this guard the
        // later simpleCrawler.SettingInit() call fails with a NullReferenceException
        // that gives no hint the class name was wrong.
        Console.WriteLine("无法创建爬取类: {0}", factoryClassName);
        return;
    }

    // Seed addresses (Settings.SeedsAddress), URL keywords (Settings.HrefKeywords) and the
    // thread count (Settings.ThreadCount) are not set here; the original kept them only as
    // commented-out examples. Original note: seed addresses should also be added to the
    // bloom filter so they are not read a second time.

    // Crawl depth.
    Settings.Depth = 27;

    // Links to ignore while crawling, matched by suffix; more than one may be added.
    Settings.EscapeLinks.Add(".jpg");

    // Automatic speed limit (described in the original note as a random 1-5 second
    // interval) is switched off.
    Settings.AutoSpeedLimit = false;

    // Host locking is switched off. Original note: with locking, hosts are compared after
    // stripping the second-level domain, so mail.pzcast.com and www.pzcast.com count as
    // the same site.
    Settings.LockHost = false;

    // Proxy configuration.
    Settings.CurWebProxy = GetWebProxy();
    Settings.CurWebProxyString = GetWebProxyString();

    // Left at their defaults per the original notes: the User-Agent header
    // (settings.UserAgent), the page request timeout (settings.Timeout, noted as
    // 15000 ms by default), the regular-expression filters
    // (settings.RegularFilterExpressions) and the IP proxy list (Settings.IPProxyList).

    // Let the selected crawler apply its own settings before the crawl starts.
    simpleCrawler.SettingInit();
    StartDBChangeProcess();

    var master = new CrawlMaster(Settings);
    master.AddUrlEvent += MasterAddUrlEvent;
    master.DataReceivedEvent += MasterDataReceivedEvent;
    master.CrawlErrorEvent += CrawlErrorEvent;
    master.Crawl();

    // Wait for a key press when URLs are still queued after Crawl() returns.
    if (UrlQueue.Instance.Count > 0)
    {
        Console.ReadKey();
    }
}