/// <summary>
/// Builds the entity spider that starts at https://hao.360.cn/ and extracts
/// <c>UpdateHao360Info</c> entities, using a Redis scheduler and a MySQL entity pipeline.
/// </summary>
/// <returns>The configured <see cref="EntitySpider"/>.</returns>
// NOTE(review): the Redis and MySQL credentials below are hard-coded in source;
// consider moving them to configuration.
protected override EntitySpider GetEntitySpider()
{
    var spider = new EntitySpider(new Site());
    spider.UserId = "DotnetSpider";
    spider.TaskGroup = "HaoBrowser";
    // The identity carries a timestamp, so it differs between runs started at different seconds.
    spider.Identity = "HaoBrowser Hao360Spider Buble " + DateTime.Now.ToString("yyyy-MM-dd HHmmss");
    spider.CachedSize = 1;
    spider.ThreadNum = 1;
    spider.SkipWhenResultIsEmpty = true;

    var downloader = new HttpClientDownloader();
    downloader.DownloadCompleteHandlers = new IDownloadCompleteHandler[]
    {
        // StartOffset (27) equals the character length of the Start marker.
        // Presumably this trims the page to the data between the two markers - TODO confirm.
        new SubContentHandler
        {
            Start = "sales[\"hotsite_yixing\"] = [",
            End = "}}",
            StartOffset = 27,
            EndOffset = 0
        },
        // Replaces the two-character sequence \/ with a plain slash.
        new ReplaceContentHandler
        {
            NewValue = "/",
            OldValue = "\\/"
        }
    };
    spider.Downloader = downloader;

    spider.SetScheduler(new Extension.Scheduler.RedisScheduler("127.0.0.1:6379,serviceName=Scheduler.NET,keepAlive=8,allowAdmin=True,connectTimeout=10000,password=6GS9F2QTkP36GggE0c3XwVwI,abortConnect=True,connectRetry=20"));
    spider.AddPipeline(new MySqlEntityPipeline("Database='testhao';Data Source= localhost;User ID=root;Password=root@123456;Port=3306"));
    spider.AddStartUrl("https://hao.360.cn/");
    spider.AddEntityType(typeof(UpdateHao360Info));

    return spider;
}
/// <summary>
/// Builds an entity spider that starts at http://www.cas.cn/kx/kpwz/index.shtml and
/// extracts <c>ArticleSummary</c> entities into a <c>CollectEntityPipeline</c>.
/// Also sets this instance's <c>Name</c> and a timestamp-based <c>Batch</c>.
/// </summary>
/// <returns>The configured <see cref="EntitySpider"/>.</returns>
protected override EntitySpider GetEntitySpider()
{
    var spider = new EntitySpider(new Site());

    // NOTE(review): several settings below are applied twice (site, thread count,
    // empty-sleep time). Presumably this exercises both the setter methods and the
    // properties - confirm before removing any of them.
    spider.SetSite(new Site());
    spider.SetThreadNum(2);
    spider.ThreadNum = 1; // assigned after SetThreadNum(2)
    spider.RetryWhenResultIsEmpty = false;
    spider.Deep = 100;
    spider.EmptySleepTime = 5000;
    spider.SetEmptySleepTime(5000);
    spider.ExitWhenComplete = true;
    spider.CachedSize = 1;

    // Infrastructure: downloader and scheduler.
    spider.SetDownloader(new HttpClientDownloader());
    spider.SetScheduler(new QueueDuplicateRemovedScheduler());
    spider.SkipWhenResultIsEmpty = true;
    spider.SpawnUrl = true;

    // Output, entry point and entity mapping.
    spider.AddPipeline(new CollectEntityPipeline());
    spider.AddStartUrl("http://www.cas.cn/kx/kpwz/index.shtml");
    spider.AddEntityType(typeof(ArticleSummary));

    // Members of the enclosing class, not of the spider.
    Name = "qidian";
    Batch = DateTime.Now.ToString("yyyy_MM_dd_HHmmss");

    return spider;
}
/// <summary>
/// Runs a spider backed by a Redis scheduler, then connects to Redis directly and
/// asserts that every key derived from the MD5 of the spider's identity is empty or absent.
/// </summary>
/// <remarks>
/// Needs a Redis server reachable on 127.0.0.1:6379. The key prefixes used here are
/// presumably the ones the scheduler writes - verify against the scheduler implementation.
/// </remarks>
public void ClearScheduler()
{
    EntitySpider spider = new EntitySpider(new Site());
    // A fresh GUID keeps this run's keys separate from any other run's.
    spider.Identity = Guid.NewGuid().ToString("N");
    spider.SetScheduler(new RedisScheduler { Host = "localhost", Password = "******" });
    spider.AddStartUrl("https://baidu.com");
    spider.AddEntityPipeline(new ConsoleEntityPipeline());
    spider.AddEntityType(typeof(TestEntity));
    spider.Run();

    var configuration = new ConfigurationOptions()
    {
        ServiceName = "DotnetSpider",
        ConnectTimeout = 65530,
        KeepAlive = 8,
        ConnectRetry = 3,
        ResponseTimeout = 3000,
        Password = "******",
        AllowAdmin = true
    };
    configuration.EndPoints.Add(new DnsEndPoint("127.0.0.1", 6379));

    // Dispose the multiplexer so the test does not leak its Redis connection.
    using (var redis = ConnectionMultiplexer.Connect(configuration))
    {
        var db = redis.GetDatabase(0);

        var md5 = Encrypt.Md5Encrypt(spider.Identity);
        var itemKey = "item-" + md5;
        var setKey = "set-" + md5;
        var queueKey = "queue-" + md5;
        var errorCountKey = "error-record" + md5;
        var successCountKey = "success-record" + md5;

        // queue
        Assert.Equal(0, db.ListLength(queueKey));
        // set
        Assert.Equal(0, db.SetLength(setKey));
        // item
        Assert.Equal(0, db.HashLength(itemKey));
        // error-count
        Assert.False(db.StringGet(errorCountKey).HasValue);
        // success-count
        Assert.False(db.StringGet(successCountKey).HasValue);
    }
}
/// <summary>
/// Builds the entity spider that starts at page 1 of the tsia.org.tw member list,
/// with one entity type for member-list pages and one for member-detail pages,
/// a MySQL entity pipeline and a Redis scheduler.
/// </summary>
/// <returns>The configured <see cref="EntitySpider"/>.</returns>
protected override EntitySpider GetEntitySpider()
{
    EntitySpider context = new EntitySpider(new Site { })
    {
        UserId = "DotnetSpider",
        TaskGroup = "RuthSpider"
    };
    context.SetThreadNum(1);
    // "HH" is the 24-hour clock; "hh" would give the same identity to runs twelve hours apart.
    context.SetIdentity("RuthSpider " + DateTime.Now.ToString("yyyy_MM_dd_HHmmss"));
    context.AddEntityPipeline(new MySqlEntityPipeline("Database='mysql';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"));
    context.AddStartUrl("http://www.tsia.org.tw/member_list.php?page=1");

    // Register the entity for the company list pages.
    context.AddEntityType(typeof(CompanySummary), new TargetUrlExtractor
    {
        Patterns = new List<string> { @"member_list.php\?page=\d+" }
    });

    // Register the entity for the company detail pages.
    context.AddEntityType(typeof(Company), new TargetUrlExtractor
    {
        Patterns = new List<string> { @"member_info.php\?ID=\d+" }
    });

    // Configure the Redis scheduler.
    context.SetScheduler(new RedisScheduler
    {
        Host = "localhost",
        Password = "",
        Port = 6379
    });

    return context;
}
/// <summary>
/// Builds the entity spider that starts at https://hao.360.cn/ and extracts
/// <c>UpdateHao360Info</c> entities, using a Redis scheduler on 127.0.0.1:6379
/// and a MySQL entity pipeline.
/// </summary>
/// <returns>The configured <see cref="EntitySpider"/>.</returns>
// NOTE(review): the database credentials below are hard-coded in source;
// consider moving them to configuration.
protected override EntitySpider GetEntitySpider()
{
    var spider = new EntitySpider(new Site());
    spider.UserId = "DotnetSpider";
    spider.TaskGroup = "HaoBrowser";
    // The identity carries a timestamp, so it differs between runs started at different seconds.
    spider.Identity = "HaoBrowser Hao360Spider Buble " + DateTime.Now.ToString("yyyy-MM-dd HHmmss");
    spider.CachedSize = 1;
    spider.ThreadNum = 1;
    spider.SkipWhenResultIsEmpty = true;

    var downloader = new HttpClientDownloader();
    downloader.DownloadCompleteHandlers = new IDownloadCompleteHandler[]
    {
        // StartOffset (27) equals the character length of the Start marker.
        // Presumably this trims the page to the data between the two markers - TODO confirm.
        new SubContentHandler
        {
            Start = "sales[\"hotsite_yixing\"] = [",
            End = "}}",
            StartOffset = 27,
            EndOffset = 0
        },
        // Replaces the two-character sequence \/ with a plain slash.
        new ReplaceContentHandler
        {
            NewValue = "/",
            OldValue = "\\/"
        }
    };
    spider.Downloader = downloader;

    var scheduler = new Extension.Scheduler.RedisScheduler
    {
        Host = "127.0.0.1",
        Port = 6379,
        Password = "******"
    };
    spider.SetScheduler(scheduler);

    // NOTE(review): port 4306 is not the MySQL default (3306) - confirm it is intentional.
    spider.AddEntityPipeline(new MySqlEntityPipeline("Database='testhao';Data Source= 127.0.0.1;User ID=root;Password=root@123456;Port=4306"));
    spider.AddStartUrl("https://hao.360.cn/");
    spider.AddEntityType(typeof(UpdateHao360Info));

    return spider;
}