/// <summary>
/// Adds 100 pages to a fresh <c>RedisScheduler</c>, clears it, and verifies that the
/// scheduler reports a count of zero and that no key matching
/// <c>CrawledPage_Test_*</c> remains on the first configured Redis endpoint.
/// </summary>
public void ClearTest()
{
    const int pageCount = 100;
    var redisScheduler = new RedisScheduler(_state);

    var index = 0;
    while (index < pageCount)
    {
        redisScheduler.Add(CreatePageToCrawl("http://www.test.com/", index + ".html"));
        ++index;
    }

    redisScheduler.Clear();
    Assert.That(redisScheduler.Count, Is.EqualTo(0));

    // Query the server directly so the check does not rely on the scheduler's own bookkeeping.
    var firstEndPoint = _state.Connection.GetEndPoints().First();
    var redisServer = _state.Connection.GetServer(firstEndPoint);
    var remainingKeys = redisServer.Keys(_database, pattern: "CrawledPage_Test_*").LongCount();
    Assert.That(remainingKeys, Is.EqualTo(0L));
}
/// <summary>
/// Adds a single page to a fresh <c>RedisScheduler</c> and verifies the resulting Redis
/// state: the <c>PageToCrawl_Test</c> list exists with exactly one entry, the
/// <c>CrawledPage_Test_http://www.test.com/1.html</c> key exists, and the list entry
/// deserializes to a <c>PageToCrawl</c> with the same <c>Uri</c> as the page that was added.
/// </summary>
public void AddTest()
{
    const string listKey = "PageToCrawl_Test";
    const string urlKey = "CrawledPage_Test_http://www.test.com/1.html";

    var scheduler = new RedisScheduler(_state);
    var expectedPage = CreatePageToCrawl("http://www.test.com", "1.html");
    scheduler.Add(expectedPage);

    var db = _state.Connection.GetDatabase();
    Assert.That(db.KeyExists(listKey), Is.True);
    Assert.That(db.KeyExists(urlKey), Is.True);
    Assert.That(db.ListLength(listKey), Is.EqualTo(1L));

    var json = (string) db.ListGetByIndex(listKey, 0);
    var storedPage = JsonConvert.DeserializeObject<PageToCrawl>(json);

    // Actual value first, expectation inside the constraint, so a failure message
    // reports the stored page as "But was" and the added page as "Expected".
    Assert.That(storedPage.Uri, Is.EqualTo(expectedPage.Uri));
}
/// <summary>
/// Configures the spider: one thread, a Redis-backed scheduler, an
/// <c>HttpClientDownloader</c>, a database start-URL builder that formats each
/// <c>sku</c> from this week's <c>jd.sku_v2_*</c> table into a checkChat URL, a MySQL
/// entity pipeline, and the <c>ProductUpdater</c> entity type.
/// </summary>
/// <param name="arguments">Not read by this method.</param>
protected override void MyInit(params string[] arguments)
{
    // The start-URL builder and the entity pipeline are given the same connection string.
    const string mySqlConnectString = "Database='mysql';Data Source=localhost;User ID=root;Password=;Port=3306;SslMode=None;";

    ThreadNum = 1;
    Scheduler = new RedisScheduler("127.0.0.1:6379,serviceName=Scheduler.NET,keepAlive=8,allowAdmin=True,connectTimeout=10000,password=,abortConnect=True,connectRetry=20");
    Downloader = new HttpClientDownloader();

    // Disabled sub-content handler, kept for reference:
    //Downloader.AddAfterDownloadCompleteHandler(new SubContentHandler
    //{
    //    Start = "json(",
    //    End = ");",
    //    StartOffset = 5,
    //    EndOffset = 0
    //});

    var startUrlQuery = $"SELECT * FROM jd.sku_v2_{DateTimeUtils.RunIdOfMonday} WHERE shopname is null or shopid is null order by sku";
    var startUrlColumns = new[] { "sku" };
    var startUrlBuilder = new DbStartUrlBuilder(
        Database.MySql,
        mySqlConnectString,
        startUrlQuery,
        startUrlColumns,
        "http://chat1.jd.com/api/checkChat?my=list&pidList={0}&callback=json");
    AddStartUrlBuilder(startUrlBuilder);

    var entityPipeline = new MySqlEntityPipeline(mySqlConnectString);
    AddPipeline(entityPipeline);

    AddEntityType(typeof(ProductUpdater));
}
/// <summary>
/// Creates a listener for <paramref name="spider"/>. When the spider has
/// <c>SaveStatusToRedis</c> set and its scheduler is a <c>RedisScheduler</c>, a background
/// task calls <c>UpdateStatus</c> against Redis database 0, sleeping 3000 ms between
/// attempts, until an update succeeds after <c>Closed</c> was observed to be set.
/// </summary>
/// <param name="spider">The spider whose status is monitored.</param>
public MonitorSpiderListener(Core.Spider spider)
{
    _spider = spider;

    if (!spider.SaveStatusToRedis)
    {
        return;
    }

    // LongRunning: the loop below blocks in Thread.Sleep for as long as the spider runs,
    // so hint the scheduler not to park it on a regular thread-pool worker.
    Task.Factory.StartNew(() =>
    {
#if !NET_CORE
        RedisScheduler scheduler = spider.Scheduler as RedisScheduler;
        if (scheduler == null)
        {
            return;
        }

        ConnectionMultiplexer redis = scheduler.Redis;
        IDatabase db = redis.GetDatabase(0);

        while (true)
        {
            try
            {
                // Read Closed before updating: the loop ends only after an update that
                // started with Closed already set has completed without throwing.
                bool closed = Closed;
                UpdateStatus(db);
                if (closed)
                {
                    break;
                }
            }
            catch (Exception)
            {
                // Deliberately best effort: a failed status write must not stop
                // monitoring; the update is attempted again after the sleep.
            }

            Thread.Sleep(3000);
        }
#endif
    }, TaskCreationOptions.LongRunning);
}
/// <summary>
/// Pushes one request to a cleared <c>RedisScheduler</c> on <c>localhost</c> and checks
/// that the first <c>Poll</c> returns a request with the same URL and the second
/// <c>Poll</c> returns <c>null</c>.
/// </summary>
public void RedisTest()
{
    const string url = "http://www.ibm.com/developerworks/cn/java/j-javadev2-22/";

    RedisScheduler redisScheduler = new RedisScheduler("localhost", "");
    try
    {
        // Start from an empty scheduler so earlier runs cannot affect the result.
        redisScheduler.Clear();

        Request request = new Request(url, 1, null);
        request.PutExtra("1", "2");
        redisScheduler.Push(request);

        Request result = redisScheduler.Poll();
        Assert.AreEqual(url, result.Url.ToString());

        Request result1 = redisScheduler.Poll();
        Assert.IsNull(result1);
    }
    finally
    {
        // Clear runs before Dispose so it never operates on a disposed scheduler, and
        // both run even when an assertion above fails.
        try
        {
            redisScheduler.Clear();
        }
        finally
        {
            redisScheduler.Dispose();
        }
    }
}
/// <summary>
/// Pushes four requests (<c>http://www.ibm.com/1</c> through <c>/4</c>) to a
/// <c>RedisScheduler</c> on <c>localhost</c> and checks that the first two <c>Poll</c>
/// calls return <c>/4</c> and then <c>/3</c>, i.e. the most recently pushed first.
/// </summary>
public void Redis_QueueTest()
{
    RedisScheduler redisScheduler = new RedisScheduler("localhost", "");
    try
    {
        // Start from an empty scheduler, as RedisTest does, so requests left behind by
        // an earlier run cannot change what Poll returns.
        redisScheduler.Clear();

        for (int i = 1; i <= 4; ++i)
        {
            redisScheduler.Push(new Request("http://www.ibm.com/" + i, 1, null));
        }

        Request result = redisScheduler.Poll();
        Assert.AreEqual("http://www.ibm.com/4", result.Url.ToString());

        Request result1 = redisScheduler.Poll();
        Assert.AreEqual("http://www.ibm.com/3", result1.Url.ToString());
    }
    finally
    {
        // Two requests are still queued at this point; remove them and release the
        // scheduler even when an assertion above fails.
        try
        {
            redisScheduler.Clear();
        }
        finally
        {
            redisScheduler.Dispose();
        }
    }
}
/// <summary>
/// Configures the spider: cache size 1, eight threads, a Redis-backed scheduler, an
/// <c>HttpClientDownloader</c> with one <c>SubContentHandler</c>, start URLs prepared from
/// the <c>sku</c> column of this week's <c>jd.sku_v2_*</c> table, a MySQL entity pipeline,
/// and the <c>ProductUpdater</c> entity type.
/// </summary>
protected override void MyInit()
{
    // NOTE(review): the Redis and MySQL credentials below are embedded in source.
    CachedSize = 1;
    ThreadNum = 8;
    Scheduler = new RedisScheduler("127.0.0.1:6379,serviceName=Scheduler.NET,keepAlive=8,allowAdmin=True,connectTimeout=10000,password=6GS9F2QTkP36GggE0c3XwVwI,abortConnect=True,connectRetry=20");

    var httpDownloader = new HttpClientDownloader();
    var subContentHandler = new SubContentHandler();
    subContentHandler.Start = "json(";
    subContentHandler.End = ");";
    subContentHandler.StartOffset = 5;
    subContentHandler.EndOffset = 0;
    httpDownloader.DownloadCompleteHandlers = new IDownloadCompleteHandler[] { subContentHandler };
    Downloader = httpDownloader;

    var skuStartUrls = new BaseDbPrepareStartUrls();
    skuStartUrls.Source = DataSource.MySql;
    skuStartUrls.ConnectString = "Database='test';Data Source= localhost;User ID=root;Password=1qazZAQ!;Port=3306";
    skuStartUrls.QueryString = $"SELECT * FROM jd.sku_v2_{DateTimeUtils.RunIdOfMonday} WHERE shopname is null or shopid is null order by sku";
    skuStartUrls.Columns = new [] { new DataColumn { Name = "sku" } };
    skuStartUrls.FormateStrings = new List <string>
    {
        "http://chat1.jd.com/api/checkChat?my=list&pidList={0}&callback=json"
    };
    PrepareStartUrls = new PrepareStartUrls[] { skuStartUrls };

    // NOTE(review): the pipeline connects to port 4306 while the start-URL query above
    // uses 3306 - confirm this difference is intended.
    var entityPipeline = new MySqlEntityPipeline("Database='taobao';Data Source=localhost ;User ID=root;Password=1qazZAQ!;Port=4306");
    AddPipeline(entityPipeline);

    AddEntityType(typeof(ProductUpdater));
}
/// <summary>
/// Configures the spider: builds a <c>Site</c> with one Taobao search start URL per line of
/// <c>taobaokeyword.txt</c> (each carrying the keyword as extra data), then sets five
/// threads, skipping of empty results, a Redis-backed scheduler, an
/// <c>HttpClientDownloader</c> with two download-complete handlers, a MySQL entity
/// pipeline, and the <c>Item</c> entity type with a <c>MyDataHanlder</c>.
/// </summary>
protected override void MyInit()
{
    Site site = new Site();

    using (var reader = new StreamReader(File.OpenRead("taobaokeyword.txt")))
    {
        // Reading stops at the first null or empty line, not only at end of file.
        // NOTE(review): confirm a blank line is meant to terminate the keyword list.
        for (string keyword = reader.ReadLine(); !string.IsNullOrEmpty(keyword); keyword = reader.ReadLine())
        {
            // "{1}" below is an interpolation hole holding the literal 1, so every URL
            // carries "tab=1".
            // NOTE(review): this looks like a leftover string.Format placeholder, and the
            // keyword is inserted without URL escaping - confirm both are intended.
            var searchUrl = "https://" + $"s.taobao.com/search?q={keyword}&imgfile=&js=1&stats_click=search_radio_all%3A1&ie=utf8&sort=sale-desc&s=0&tab={1}&fs=1&filter_tianmao=tmall";
            var extras = new Dictionary <string, object>();
            extras.Add("keyword", keyword);
            site.AddStartUrl(searchUrl, extras);
        }
    }

    Site = site;
    ThreadNum = 5;
    SkipWhenResultIsEmpty = true;

    // NOTE(review): the Redis and MySQL credentials below are embedded in source.
    Scheduler = new RedisScheduler("127.0.0.1:6379,serviceName = DotnetSpider,keepAlive = 8,allowAdmin = True,connectTimeout = 10000,password = 6GS9F2QTkP36GggE0c3XwVwI,abortConnect = True,connectRetry = 20");

    var httpDownloader = new HttpClientDownloader();
    var pageConfigHandler = new SubContentHandler();
    pageConfigHandler.StartOffset = 16;
    pageConfigHandler.EndOffset = 22;
    pageConfigHandler.Start = "g_page_config = {";
    pageConfigHandler.End = "g_srp_loadCss();";
    var targetUrlsCreator = new IncrementTargetUrlsCreator("&s=0", null, 44);
    httpDownloader.DownloadCompleteHandlers = new IDownloadCompleteHandler[]
    {
        pageConfigHandler,
        targetUrlsCreator
    };
    Downloader = httpDownloader;

    var entityPipeline = new MySqlEntityPipeline("Database = 'mysql'; Data Source = localhost; User ID = root; Password = 1qazZAQ!; Port = 3306");
    AddPipeline(entityPipeline);

    AddEntityType(typeof(Item), new MyDataHanlder());
}
/// <summary>
/// Configures the spider: a Redis-backed scheduler from <c>Config.RedisConnectString</c>,
/// an <c>HttpClientDownloader</c> with a <c>ReplaceContentHandler</c> and an
/// <c>IncrementTargetUrlsCreator</c>, one thread, skipping of empty results, and the
/// <c>Item</c> entity type with a <c>MyDataHanlder</c>. Unless <c>"noprepare"</c> is among
/// the arguments, start URLs are also prepared from <c>taobao.result_keywords</c>.
/// </summary>
/// <param name="arguments">
/// Command-line style switches; only <c>"noprepare"</c> is inspected here.
/// </param>
protected override void MyInit(params string[] arguments)
{
    Scheduler = new RedisScheduler(Config.RedisConnectString);

    var httpDownloader = new HttpClientDownloader();
    var slashReplacer = new ReplaceContentHandler();
    slashReplacer.NewValue = "/";
    slashReplacer.OldValue = "\\/";
    httpDownloader.AddAfterDownloadCompleteHandler(slashReplacer);
    httpDownloader.AddAfterDownloadCompleteHandler(new IncrementTargetUrlsCreator("&s=0", null, 44));
    Downloader = httpDownloader;

    ThreadNum = 1;
    SkipWhenResultIsEmpty = true;

    bool prepareRequested = !arguments.Contains("noprepare");
    if (prepareRequested)
    {
        var keywordStartUrls = new BaseDbPrepareStartUrls();
        keywordStartUrls.BulkInsert = true;
        keywordStartUrls.ConnectString = Config.ConnectString;
        keywordStartUrls.QueryString = "SELECT * FROM taobao.result_keywords limit 10000";
        keywordStartUrls.Columns = new []
        {
            new DataColumn("bidwordstr"),
            new DataColumn("tab")
        };
        keywordStartUrls.FormateStrings = new List <string>
        {
            "https://s.taobao.com/search?q={0}&imgfile=&js=1&stats_click=search_radio_all%3A1&ie=utf8&sort=sale-desc&s=0&tab={1}"
        };
        PrepareStartUrls = new PrepareStartUrls[] { keywordStartUrls };
    }

    AddEntityType(typeof(Item), new MyDataHanlder());
}
/// <summary>
/// Removes any existing task for a <c>TestSpider</c>, pushes one request for it to a
/// <c>RedisScheduler</c> on <c>localhost</c>, and checks that the first <c>Poll</c> returns
/// a request with the same URL and the second <c>Poll</c> returns <c>null</c>.
/// </summary>
public void RedisTest()
{
    const string url = "http://www.ibm.com/developerworks/cn/java/j-javadev2-22/";

    RedisScheduler redisScheduler = new RedisScheduler("localhost", "");
    ISpider spider = new TestSpider();
    RedisSchedulerManager m = new RedisSchedulerManager("localhost");

    try
    {
        // Drop the spider's task first so earlier runs cannot affect the result.
        m.RemoveTask(spider.Identity);

        Request request = new Request(url, 1, null);
        request.PutExtra("1", "2");
        redisScheduler.Push(request, spider);

        Request result = redisScheduler.Poll(spider);
        Assert.AreEqual(url, result.Url.ToString());

        Request result1 = redisScheduler.Poll(spider);
        Assert.IsNull(result1);
    }
    finally
    {
        // Release the scheduler and remove the task even when an assertion above fails.
        try
        {
            redisScheduler.Dispose();
        }
        finally
        {
            m.RemoveTask(spider.Identity);
        }
    }
}