/// <summary>
/// Runs one spider asynchronously over 20 start URLs, sends it the exit signal after 500 ms and
/// expects 10 successful requests; then runs a second spider synchronously over 25 start URLs
/// and expects all 25 to be counted as successful.
/// </summary>
public void CloseSignal()
{
    // --- Spider that is interrupted by an exit signal ---
    var interruptedSite = new Site { CycleRetryTimes = 5, EncodingName = "UTF-8" };
    Spider interrupted = Spider.Create(interruptedSite, new TestPageProcessor())
        .AddPipeline(new TestPipeline());
    // The scheduler counter is read after the run (presumably this flag keeps it from being reset).
    interrupted.ClearSchedulerAfterCompleted = false;

    int index = 0;
    while (index < 20)
    {
        interrupted.AddStartUrl($"http://www.baidu.com/_t={index}");
        ++index;
    }

    var running = interrupted.RunAsync();
    Thread.Sleep(500);
    interrupted.SendExitSignal();
    running.Wait();

    Assert.Equal(10, interrupted.Scheduler.SuccessRequestsCount);

    // --- Spider that runs to completion ---
    var completeSite = new Site { CycleRetryTimes = 5, EncodingName = "UTF-8" };
    Spider complete = Spider.Create(completeSite, new TestPageProcessor())
        .AddPipeline(new TestPipeline());
    complete.ClearSchedulerAfterCompleted = false;

    int second = 0;
    while (second < 25)
    {
        complete.AddStartUrl($"http://www.baidu.com/_t={second}");
        ++second;
    }

    complete.Run();

    Assert.Equal(25, complete.Scheduler.SuccessRequestsCount);
}
/// <summary>
/// Runs a single-threaded spider over five start URLs with a <c>DbMonitor</c> attached, then
/// queries the system database to verify that the last log row for this identity starts with
/// "Crawl complete, cost", that exactly one status row exists, and that its status is "Finished".
/// </summary>
public void DatebaseLogAndStatus()
{
    Env.Reload();

    // Fresh identity per run so the queries below only match rows written by this test.
    string id = Guid.NewGuid().ToString("N");

    using (Spider spider = Spider.Create(new Site { EncodingName = "UTF-8", SleepTime = 1000 }, id, new QueueDuplicateRemovedScheduler(), new TestPageProcessor()))
    {
        spider.Monitor = new DbMonitor(spider.TaskId, spider.Identity);
        spider.AddPipeline(new TestPipeline());
        spider.ThreadNum = 1;
        for (int i = 0; i < 5; i++)
        {
            spider.AddStartUrl("http://www.baidu.com/" + i);
        }
        spider.Run();
    }

    // Fixed wait before reading the database (presumably lets pending log/status writes land).
    Thread.Sleep(3000);

    using (var conn = (Env.SystemConnectionStringSettings.GetDbConnection()))
    {
        Assert.StartsWith("Crawl complete, cost", conn.Query<Log>($"SELECT * FROM DotnetSpider.Log where Identity='{id}'").Last().message);
        Assert.Equal(1, conn.Query<CountResult>($"SELECT COUNT(*) as Count FROM DotnetSpider.Status where Identity='{id}'").First().Count);
        Assert.Equal("Finished", conn.Query<statusObj>($"SELECT * FROM DotnetSpider.Status where Identity='{id}'").First().status);
    }
}
/// <summary>
/// Runs a single-threaded spider over five start URLs with a <c>MySqlMonitor</c> attached, then
/// queries the same MySQL instance to verify that the last log row for this identity starts with
/// "Crawl complete, cost", that exactly one status row exists, and that its status is "Finished".
/// </summary>
public void DatebaseLogAndStatus()
{
    // One definition shared by the monitor and the verification connection so they cannot diverge.
    const string connectString = "Database='mysql';Data Source=localhost;User ID=root;Port=3306;SslMode=None;";

    // Fresh identity per run so the queries below only match rows written by this test.
    string id = Guid.NewGuid().ToString("N");

    using (Spider spider = Spider.Create(new Site { EncodingName = "UTF-8", SleepTime = 1000 }, id, new QueueDuplicateRemovedScheduler(), new TestPageProcessor()))
    {
        spider.Monitor = new MySqlMonitor(spider.TaskId, spider.Identity, false, connectString);
        spider.AddPipeline(new TestPipeline());
        spider.ThreadNum = 1;
        for (int i = 0; i < 5; i++)
        {
            spider.AddStartUrl("http://www.baidu.com/" + i);
        }
        spider.Run();
    }

    // Fixed wait before reading the database (presumably lets pending log/status writes land).
    Thread.Sleep(3000);

    using (var conn = new MySqlConnection(connectString))
    {
        Assert.StartsWith("Crawl complete, cost", conn.Query<Log>($"SELECT * FROM DotnetSpider.Log where Identity='{id}'").Last().message);
        Assert.Equal(1, conn.Query<CountResult>($"SELECT COUNT(*) as Count FROM DotnetSpider.Status where Identity='{id}'").First().Count);
        Assert.Equal("Finished", conn.Query<statusObj>($"SELECT * FROM DotnetSpider.Status where Identity='{id}'").First().status);
    }
}
/// <summary>
/// Runs a spider backed by <c>TestDownloader</c> and a <c>MySqlMonitor</c> over five start URLs,
/// then reads the log and status tables to verify that the last log row for this identity starts
/// with "Crawl complete, cost", that exactly one status row exists, and that it reads "Finished".
/// </summary>
public void DatebaseLogAndStatus()
{
    const string mysqlConnectString = "Database='mysql';Data Source=localhost;User ID=root;Port=3306;SslMode=None;";

    LogUtil.Init();
    string id = Guid.NewGuid().ToString("N");
    Env.NodeId = "DEFAULT";

    var site = new Site { EncodingName = "UTF-8" };
    using (Spider crawler = Spider.Create(site, id, new QueueDuplicateRemovedScheduler(), new TestPageProcessor()))
    {
        crawler.Downloader = new TestDownloader();
        crawler.TaskId = "1";
        crawler.Monitor = new MySqlMonitor(crawler.TaskId, crawler.Identity, false, mysqlConnectString);
        crawler.AddPipeline(new TestPipeline());

        int n = 0;
        while (n < 5)
        {
            // NOTE(review): id is passed as an extra argument with no placeholder in the message;
            // presumably the project's log sink consumes it - confirm before changing.
            Serilog.Log.Logger.Information("add start url" + n, id);
            crawler.AddStartUrl("http://www.baidu.com/" + n);
            n++;
        }

        crawler.EmptySleepTime = 1000;
        crawler.Run();
    }

    using (var db = new MySqlConnection(mysqlConnectString))
    {
        var logRows = conn_QueryLogs(db, id);
        var lastLog = logRows[logRows.Count - 1];
        Assert.StartsWith("Crawl complete, cost", lastLog.message);

        var statusCount = db.Query<CountResult>($"SELECT COUNT(*) as Count FROM dotnetspider.status where identity='{id}'").First();
        Assert.Equal(1, statusCount.Count);

        var statusRow = db.Query<statusObj>($"SELECT * FROM dotnetspider.status where identity='{id}'").First();
        Assert.Equal("Finished", statusRow.status);
    }
}

// Loads every log row written for the given identity, materialized as a list.
private static System.Collections.Generic.List<Log> conn_QueryLogs(MySqlConnection db, string id)
{
    return db.Query<Log>($"SELECT * FROM dotnetspider.log where identity='{id}'").ToList();
}
/// <summary>
/// Stores the log/status connection string in configuration, runs a single-threaded spider over
/// five start URLs registered with <c>MonitorCenter</c>, and then verifies that one status row
/// and seven log rows exist for this user id, task group and identity.
/// </summary>
public void DatebaseLogAndStatus()
{
    // Fresh identifiers per run so the counts below only see rows written by this test.
    string id = Guid.NewGuid().ToString("N");
    string taskGroup = Guid.NewGuid().ToString("N");
    string userId = Guid.NewGuid().ToString("N");

    // NOTE(review): credentials are hard-coded in the test source.
    string connectString = "Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306";
    Configuration.SetValue("logAndStatusConnectString", connectString);
    var storedValue = Configuration.GetValue("logAndStatusConnectString");
    Assert.Equal("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306", storedValue);

    var site = new Site { EncodingName = "UTF-8", MinSleepTime = 1000 };
    using (Spider crawler = Spider.Create(site, id, userId, taskGroup, new TestPageProcessor(), new QueueDuplicateRemovedScheduler()))
    {
        crawler.AddPipeline(new TestPipeline()).SetThreadNum(1);

        int n = 0;
        while (n < 5)
        {
            crawler.AddStartUrl("http://www.baidu.com/" + n);
            n++;
        }

        MonitorCenter.Register(crawler);
        crawler.Run();
    }

    using (MySqlConnection db = new MySqlConnection(connectString))
    {
        var statusRows = db.Query<CountResult>($"SELECT COUNT(*) as Count FROM dotnetspider.status where userid='{userId}' and taskgroup='{taskGroup}' and identity='{id}'").First();
        Assert.Equal(1, statusRows.Count);

        var logRows = db.Query<CountResult>($"SELECT COUNT(*) as Count FROM dotnetspider.log where userid='{userId}' and taskgroup='{taskGroup}' and identity='{id}'").First();
        Assert.Equal(7, logRows.Count);
    }
}
/// <summary>
/// Writes a fresh cookie file for www.baidu.com, then runs a single-threaded spider over 10000
/// start URLs using an <c>HttpClientDownloader</c> that has a <c>TimerUpdateCookieHandler</c>
/// (built with 5 and a <c>FileCookieInject</c>) attached as an after-download handler.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
public static void Main(string[] args)
{
    // Recreate the cookie file from scratch.
    var cookieFile = "www.baidu.com.cookies";
    if (File.Exists(cookieFile))
    {
        File.Delete(cookieFile);
    }
    File.WriteAllText(cookieFile, "a=b&c=d");

    var site = new Site { EncodingName = "UTF-8", SleepTime = 1000 };
    Spider crawler = Spider.Create(site, new TestPageProcessor())
        .AddPipeline(new TestPipeline());
    crawler.ThreadNum = 1;

    var httpDownloader = new HttpClientDownloader();
    httpDownloader.AddAfterDownloadCompleteHandler(new TimerUpdateCookieHandler(5, new FileCookieInject()));
    crawler.Downloader = httpDownloader;

    int n = 0;
    while (n < 10000)
    {
        crawler.AddStartUrl("http://www.baidu.com/" + n);
        n++;
    }

    crawler.Run();
}
/// <summary>
/// Stores the connection string in configuration and checks it reads back unchanged, runs a
/// single-threaded spider over five start URLs with a <c>DbMonitor</c> attached, and then
/// verifies that 15 log rows and one status row exist for this identity.
/// </summary>
public void DatebaseLogAndStatus()
{
    // Fresh identity per run so the counts below only see rows written by this test.
    string id = Guid.NewGuid().ToString("N");

    // NOTE(review): credentials are hard-coded in the test source.
    string connectString = "Database='mysql';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306";
    Core.Infrastructure.Config.SetValue("connectString", connectString);
    // Round-trip check against the value just stored, instead of a second copy of the literal.
    Assert.AreEqual(connectString, Core.Infrastructure.Config.GetValue("connectString"));

    using (Spider spider = Spider.Create(new Site { EncodingName = "UTF-8", SleepTime = 1000 }, id, new QueueDuplicateRemovedScheduler(), new TestPageProcessor()))
    {
        spider.AddPipeline(new TestPipeline());
        spider.ThreadNum = 1;
        for (int i = 0; i < 5; i++)
        {
            spider.AddStartUrl("http://www.baidu.com/" + i);
        }
        spider.Monitor = new DbMonitor(id);
        spider.Run();
    }

    using (MySqlConnection conn = new MySqlConnection(connectString))
    {
        Assert.AreEqual(15, conn.Query<CountResult>($"SELECT COUNT(*) as Count FROM dotnetspider.log where identity='{id}'").First().Count);
        Assert.AreEqual(1, conn.Query<CountResult>($"SELECT COUNT(*) as Count FROM dotnetspider.status where identity='{id}'").First().Count);
    }
}
/// <summary>
/// Runs a single-threaded spider with zero sleep times over five start URLs, asserts the run
/// (including spider construction) took under 3000 ms, and asserts that every start URL appears
/// as a line in "FastExit_Result.txt". Returns immediately when the TRAVIS environment variable
/// is "1".
/// </summary>
public void FastExit()
{
    if (Environment.GetEnvironmentVariable("TRAVIS") == "1")
    {
        return;
    }

    // Remove any result file left over from an earlier run.
    var resultFile = "FastExit_Result.txt";
    if (File.Exists(resultFile))
    {
        File.Delete(resultFile);
    }

    string[] startUrls =
    {
        "http://item.jd.com/1013286.html?_t=1",
        "http://item.jd.com/1013286.html?_t=2",
        "http://item.jd.com/1013286.html?_t=3",
        "http://item.jd.com/1013286.html?_t=4",
        "http://item.jd.com/1013286.html?_t=5"
    };

    Stopwatch timer = Stopwatch.StartNew();

    var site = new Site { CycleRetryTimes = 5, EncodingName = "UTF-8", SleepTime = 0 };
    Spider crawler = Spider.Create(site, new FastExitPageProcessor())
        .AddPipeline(new FastExitPipeline());
    crawler.ThreadNum = 1;
    crawler.EmptySleepTime = 0;
    foreach (string url in startUrls)
    {
        crawler.AddStartUrl(url);
    }
    crawler.Run();

    timer.Stop();
    Assert.True(timer.ElapsedMilliseconds < 3000);

    var lines = File.ReadAllLines(resultFile);
    foreach (string url in startUrls)
    {
        Assert.Contains(url, lines);
    }
}
/// <summary>
/// Runs a single-threaded spider (up to 5 cycle retries configured) against http://taobao.com
/// and asserts that it ends in <c>Status.Finished</c>.
/// </summary>
public void RetryWhenResultIsEmpty()
{
    var site = new Site { CycleRetryTimes = 5, EncodingName = "UTF-8", SleepTime = 1000 };
    Spider crawler = Spider.Create(site, new TestPageProcessor())
        .AddPipeline(new TestPipeline());
    crawler.ThreadNum = 1;
    crawler.AddStartUrl("http://taobao.com");

    crawler.Run();

    Assert.Equal(Status.Finished, crawler.Status);
}
/// <summary>
/// Runs a single-threaded spider with <c>RetryWhenResultIsEmpty</c> enabled against
/// http://taobao.com and asserts that its status code is <c>Status.Finished</c> afterwards.
/// </summary>
public void TestRetryWhenResultIsEmpty()
{
    var site = new Site { CycleRetryTimes = 5, EncodingName = "UTF-8", MinSleepTime = 1000, Timeout = 20000 };
    Spider crawler = Spider.Create(site, new TestPageProcessor())
        .AddPipeline(new TestPipeline())
        .SetThreadNum(1);
    crawler.AddStartUrl("http://taobao.com");
    crawler.RetryWhenResultIsEmpty = true;

    crawler.Run();

    Assert.AreEqual(Status.Finished, crawler.StatusCode);
}
/// <summary>
/// Starts a single-threaded spider over 10000 start URLs asynchronously, stops it after five
/// seconds, waits five more seconds, starts it asynchronously again and waits a final five
/// seconds. Contains no assertions.
/// </summary>
public void RunAsyncAndStop()
{
    var site = new Site { EncodingName = "UTF-8", MinSleepTime = 1000 };
    Spider crawler = Spider.Create(site, new TestPageProcessor())
        .AddPipeline(new TestPipeline())
        .SetThreadNum(1);

    int n = 0;
    while (n < 10000)
    {
        crawler.AddStartUrl("http://www.baidu.com/" + n);
        n++;
    }

    // First run, interrupted by Stop().
    crawler.RunAsync();
    Thread.Sleep(5000);
    crawler.Stop();
    Thread.Sleep(5000);

    // Second run on the same instance.
    crawler.RunAsync();
    Thread.Sleep(5000);
}
/// <summary>
/// Starts a single-threaded spider over 10000 start URLs asynchronously, and after five seconds
/// calls <c>Pause</c> with a callback that calls <c>Exit</c>, then waits five more seconds.
/// Contains no assertions.
/// </summary>
public void RunAsyncAndStopThenExit()
{
    var site = new Site { EncodingName = "UTF-8", SleepTime = 1000 };
    Spider crawler = Spider.Create(site, new TestPageProcessor())
        .AddPipeline(new TestPipeline());
    crawler.ThreadNum = 1;

    int n = 0;
    while (n < 10000)
    {
        crawler.AddStartUrl("http://www.baidu.com/" + n);
        n++;
    }

    crawler.RunAsync();
    Thread.Sleep(5000);

    crawler.Pause(() =>
    {
        crawler.Exit();
    });
    Thread.Sleep(5000);
}
/// <summary>
/// Registers <c>NLogMonitor</c> as the <c>IMonitorService</c> singleton, runs a single-threaded
/// spider backed by <c>TestDownloader</c> over 10000 start URLs, then calls <c>Stop</c> and
/// <c>RunAsync</c> with five-second waits in between.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
public static void Main(string[] args)
{
    IocExtension.ServiceCollection.AddSingleton<IMonitorService, NLogMonitor>();

    var site = new Site { EncodingName = "UTF-8", MinSleepTime = 1000 };
    Spider crawler = Spider.Create(site, new SpiderTest.TestPageProcessor())
        .AddPipeline(new SpiderTest.TestPipeline())
        .SetThreadNum(1);
    crawler.SetDownloader(new TestDownloader());

    int n = 0;
    while (n < 10000)
    {
        crawler.AddStartUrl("http://www.baidu.com/" + n);
        n++;
    }

    crawler.Run();
    Thread.Sleep(5000);

    crawler.Stop();
    Thread.Sleep(5000);

    crawler.RunAsync();
    Thread.Sleep(5000);
}
/// <summary>
/// Registers the code-pages encoding provider when built with NET_CORE, registers
/// <c>NLogMonitor</c> as the <c>IMonitor</c> singleton, runs a single-threaded spider backed by
/// <c>TestDownloader</c> over 10 start URLs, then calls <c>Stop</c> and <c>RunAsync</c> with
/// five-second waits in between.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
public static void Main(string[] args)
{
#if NET_CORE
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#endif
    IocContainer.Default.AddSingleton<IMonitor, NLogMonitor>();

    var site = new Site { EncodingName = "UTF-8", MinSleepTime = 1000 };
    Spider crawler = Spider.Create(site, new SpiderTest.TestPageProcessor())
        .AddPipeline(new SpiderTest.TestPipeline())
        .SetThreadNum(1);
    crawler.SetDownloader(new TestDownloader());

    int n = 0;
    while (n < 10)
    {
        crawler.AddStartUrl("http://www.baidu.com/" + n);
        n++;
    }

    crawler.Run();
    Thread.Sleep(5000);

    crawler.Stop();
    Thread.Sleep(5000);

    crawler.RunAsync();
    Thread.Sleep(5000);
}
/// <summary>
/// Starts a single-threaded spider over 10000 start URLs asynchronously, and after five seconds
/// calls <c>Pause</c> with a callback that calls <c>Contiune</c>, then waits five more seconds.
/// Returns immediately when the TRAVIS environment variable is "1". Contains no assertions.
/// </summary>
public void RunAsyncAndContiune()
{
    if (Environment.GetEnvironmentVariable("TRAVIS") == "1")
    {
        return;
    }

    var site = new Site { EncodingName = "UTF-8", SleepTime = 1000 };
    Spider crawler = Spider.Create(site, new TestPageProcessor())
        .AddPipeline(new TestPipeline());
    crawler.ThreadNum = 1;

    int n = 0;
    while (n < 10000)
    {
        crawler.AddStartUrl("http://www.baidu.com/" + n);
        n++;
    }

    crawler.RunAsync();
    Thread.Sleep(5000);

    crawler.Pause(() =>
    {
        crawler.Contiune();
    });
    Thread.Sleep(5000);
}