Beispiel #1
0
        /// <summary>
        /// Creates a concrete crawler instance from its type name (factory method).
        /// </summary>
        /// <param name="Name">Namespace-qualified (or assembly-qualified) type name of the crawler.</param>
        /// <param name="_Settings">Crawl settings forwarded to the crawler's constructor.</param>
        /// <param name="_filter">Bloom filter forwarded to the crawler's constructor.</param>
        /// <param name="_dataop">Data-access helper forwarded to the crawler's constructor.</param>
        /// <returns>The created crawler, or null when the type cannot be loaded.</returns>
        public ISimpleCrawler Create(string Name, CrawlSettings _Settings, BloomFilter <string> _filter, DataOperation _dataop)
        {
            ISimpleCrawler myExecuteTran = null;

            try
            {
                // throwOnError: true makes Type.GetType throw TypeLoadException for an unknown name.
                Type type = Type.GetType(Name, true);
                myExecuteTran = (ISimpleCrawler)Activator.CreateInstance(type, _Settings, _filter, _dataop);
            }
            catch (TypeLoadException e)
            {
                // Best-effort factory: report the failure instead of swallowing it silently,
                // then fall through and return null so callers decide how to proceed.
                Console.Error.WriteLine("SimpleCrawlerFactory.Create: failed to load type '{0}': {1}", Name, e.Message);
            }
            return myExecuteTran;
        }
Beispiel #2
0
        /// <summary>
        /// Entry point: resolves the crawler class name from the command line (or
        /// interactively), creates the crawler via the factory, configures the crawl
        /// settings and runs the crawl master.
        /// </summary>
        /// <param name="args">
        /// Optional; args[0] is the crawler class name inside the SimpleCrawler.Demo namespace.
        /// </param>
        private static void Main(string[] args)
        {
            // args is a plain array: use Length instead of the LINQ Count() extension.
            if (args.Length > 0)
            {
                crawlerClassName = args[0];
            }
            if (string.IsNullOrEmpty(crawlerClassName))
            {
                Console.WriteLine("请-classname 设置对应的爬取类");
                crawlerClassName = Console.ReadLine();
                if (string.IsNullOrEmpty(crawlerClassName))
                {
                    // No crawler selected; nothing to do.
                    return;
                }
            }

            var factoryClassName = string.Format("SimpleCrawler.Demo.{0}", crawlerClassName);

            filter = new BloomFilter <string>(5000000);
            //LandFangUserUpdateCrawler,LandFangCrawler
            //SimpleCrawler.Demo.LandFangUserUpdateCrawler updates the masked (*) data via simulated login
            //LandFangCityRegionCrawler fetches the guidCode mapping for cities/districts
            //LandFangCityRegionUpdateCrawler updates transaction status and district info
            //QiXinEnterpriseCrawler QiXin crawl: enterprise-to-guid mapping
            Console.WriteLine(connStr);
            Console.WriteLine(crawlerClassName);
            Console.WriteLine("确认数据库连接后继续进行");
            simpleCrawler = SimpleCrawlerFactory.Instance.Create(factoryClassName, Settings, filter, dataop);
            // The factory returns null when the type cannot be loaded; bail out early
            // instead of failing later with a NullReferenceException on SettingInit().
            if (simpleCrawler == null)
            {
                Console.WriteLine("Failed to create crawler type: " + factoryClassName);
                return;
            }
            //Console.ReadLine();
            //const string CityName = "beijing";
            // Seed addresses: seeds must also be added to the Bloom filter to avoid reading them twice.
            //Settings.SeedsAddress.Add(string.Format("http://jobs.zhaopin.com/{0}", CityName));
            // Settings.SeedsAddress.Add(string.Format("http://www.fzhouse.com.cn:7002/result_new.asp"));
            // URL keywords to match in discovered links
            //Settings.HrefKeywords.Add(string.Format("/{0}/bj", CityName));
            //Settings.HrefKeywords.Add(string.Format("/{0}/sj", CityName));
            //Settings.HrefKeywords.Add(string.Format("building.asp?ProjectID="));
            //Settings.HrefKeywords.Add(string.Format("result_new"));
            // Number of crawl threads
            //Settings.ThreadCount = 5;
            //Settings.ThreadCount =1;
            // Crawl depth
            Settings.Depth = 27;

            // Links to ignore while crawling, matched by suffix; multiple entries allowed
            Settings.EscapeLinks.Add(".jpg");

            // Automatic rate limiting: random 1-5 second delay between requests
            Settings.AutoSpeedLimit = false;

            // Lock host: after stripping the subdomain, equal domains are treated as the same site
            // e.g. mail.pzcast.com and www.pzcast.com
            Settings.LockHost = false;
            // Whether to use a web proxy
            Settings.CurWebProxy       = GetWebProxy();
            Settings.CurWebProxyString = GetWebProxyString();


            // User-Agent HTTP request header value:
            // settings.UserAgent has a sensible default; override only for special needs

            // Page request timeout, default 15000 ms:
            // settings.Timeout can be set as required

            // Regular expressions used for URL filtering
            // settings.RegularFilterExpressions.Add("http://land.fang.com/market/a0a95a6f-43d4-4b59-a948-d48f21a4e468.html");
            // Proxy-IP mode
            //Settings.IPProxyList = new List<IPProxy>();
            //var ipProxyList = dataop.FindAllByQuery("IPProxy", Query.NE("status", "1")).ToList();
            //Settings.IPProxyList.AddRange(ipProxyList.Select(c => new IPProxy(c.Text("ip"))));
            // Settings.IPProxyList.Add(new IPProxy("31.168.236.236:8080"));
            // YunFeng blog initialization
            // fang99Init();
            // JGZFBlogInit();
            simpleCrawler.SettingInit();
            StartDBChangeProcess();
            var master = new CrawlMaster(Settings);

            master.AddUrlEvent       += MasterAddUrlEvent;
            master.DataReceivedEvent += MasterDataReceivedEvent;
            master.CrawlErrorEvent   += CrawlErrorEvent;
            master.Crawl();
            // Console.WriteLine("遍历结束");
            // Keep the console open while queued URLs remain so output can be inspected.
            if (UrlQueue.Instance.Count > 0)
            {
                Console.ReadKey();
            }
        }