public void CloseSignal()
{
    Spider spider = Spider.Create(new TestPageProcessor()).AddPipeline(new TestPipeline());
    spider.EncodingName = "UTF-8";
    spider.CycleRetryTimes = 5;
    spider.ClearSchedulerAfterCompleted = false;
    for (int i = 0; i < 20; ++i)
    {
        spider.AddRequests($"http://www.baidu.com/t={i}");
    }
    var task = spider.RunAsync();
    Thread.Sleep(500);
    spider.SendExitSignal();
    task.Wait();
    Assert.Equal(10, spider.Scheduler.SuccessRequestsCount);

    Spider spider2 = Spider.Create(new TestPageProcessor()).AddPipeline(new TestPipeline());
    spider2.ClearSchedulerAfterCompleted = false;
    spider2.EncodingName = "UTF-8";
    spider2.CycleRetryTimes = 5;
    for (int i = 0; i < 25; ++i)
    {
        spider2.AddRequests($"http://www.baidu.com/t={i}");
    }
    spider2.Run();
    Assert.Equal(25, spider2.Scheduler.SuccessRequestsCount);
}
public void DatebaseLogAndStatus()
{
    string id = Guid.NewGuid().ToString("N");
    Env.NodeId = "DEFAULT";
    using (Spider spider = Spider.Create(id, new QueueDuplicateRemovedScheduler(), new TestPageProcessor()))
    {
        spider.EncodingName = "UTF-8";
        spider.Downloader = new TestDownloader();
        spider.TaskId = "1";
        spider.Monitor = new MySqlMonitor(spider.TaskId, spider.Identity, false, "Database='mysql';Data Source=localhost;User ID=root;Port=3306;SslMode=None;");
        spider.AddPipeline(new TestPipeline());
        for (int i = 0; i < 5; i++)
        {
            Serilog.Log.Logger.Information("add start url" + i, id);
            spider.AddRequests("http://www.baidu.com/" + i);
        }
        spider.EmptySleepTime = 1000;
        spider.Run();
    }

    using (var conn = new MySqlConnection("Database='mysql';Data Source=localhost;User ID=root;Port=3306;SslMode=None;"))
    {
        var logs = conn.Query<Log>($"SELECT * FROM dotnetspider.log where identity='{id}'").ToList();
        Assert.StartsWith("Crawl complete, cost", logs[logs.Count - 1].message);
        Assert.Equal(1, conn.Query<CountResult>($"SELECT COUNT(*) as Count FROM dotnetspider.status where identity='{id}'").First().Count);
        Assert.Equal("Finished", conn.Query<statusObj>($"SELECT * FROM dotnetspider.status where identity='{id}'").First().status);
    }
}
public static void Run() { Spider spider = Spider.Create( // use memoery queue scheduler new QueueDuplicateRemovedScheduler(), // default page processor will save whole html, and extract urls to target urls via regex new DefaultPageProcessor { Filter = new PatternFilter(new[] { "cnblogs\\.com" }), RequestExtractor = new XPathRequestExtractor(".") }) // save crawler result to file in the folder: \{running directory}\data\{crawler identity}\{guid}.dsd .AddPipeline(new FilePipeline()); // dowload html by http client spider.Downloader = new HttpClientDownloader(); spider.Name = "CNBLOGS"; // 4 threads 4线程 spider.ThreadNum = 4; spider.TaskId = "cnblogs"; // traversal deep 遍历深度 spider.Depth = 3; spider.EncodingName = "UTF-8"; // stop crawler if it can't get url from the scheduler after 30000 ms 当爬虫连续30秒无法从调度中心取得需要采集的链接时结束. spider.EmptySleepTime = 30000; // Set start/seed url spider.AddRequests("http://www.cnblogs.com"); // start crawler 启动爬虫 spider.Run(); }
public static void Run()
{
    var table = new TableInfo("youku", "show", TableNamePostfix.Today);
    var selector = new Selector("//div[@class='yk-pack pack-film']");
    var fields = new[]
    {
        new FieldSelector(".//img[@class='quic']/@alt", "name"),
        new FieldSelector("index", "index", SelectorType.Enviroment, DataType.Int),
        new FieldSelector("", "id", SelectorType.Enviroment, DataType.Int) { IsPrimary = true },
    };
    var targetRequestSelector = new TargetRequestSelector("//ul[@class='yk-pages']");
    var model = new ModelDefinition(selector, fields, table, targetRequestSelector);
    var json = JsonConvert.SerializeObject(model);
    Spider spider = Spider.Create(
        new QueueDuplicateRemovedScheduler(),
        new ModelProcessor(model))
        .AddPipeline(new ConsoleEntityPipeline());
    spider.Name = "Youku";
    spider.EncodingName = "UTF-8";
    spider.TaskId = "1";
    for (int i = 1; i < 5; ++i)
    {
        // Add start/seed urls.
        spider.AddRequests($"http://list.youku.com/category/show/c_96_s_1_d_1_p_{i}.html");
    }
    spider.Run();
}
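// The serialized `json` above is not consumed further in this sample. A minimal sketch of one
// possible use follows; it assumes ModelDefinition round-trips cleanly through Newtonsoft.Json,
// which the original sample does not verify, and the helper name is hypothetical.
public static ModelProcessor CreateProcessorFromJson(string json)
{
    // restore the extraction model from its JSON form and wrap it in a processor
    var model = JsonConvert.DeserializeObject<ModelDefinition>(json);
    return new ModelProcessor(model);
}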
public void FastExit()
{
    if (Environment.GetEnvironmentVariable("TRAVIS") == "1")
    {
        return;
    }
    var path = "FastExit_Result.txt";
    if (File.Exists(path))
    {
        File.Delete(path);
    }
    Stopwatch stopwatch = new Stopwatch();
    stopwatch.Start();
    Spider spider = Spider.Create(new FastExitPageProcessor())
        .AddPipeline(new FastExitPipeline());
    spider.ThreadNum = 1;
    spider.EmptySleepTime = 0;
    spider.EncodingName = "UTF-8";
    spider.CycleRetryTimes = 5;
    spider.SleepTime = 0;
    spider.AddRequests("http://war.163.com/");
    spider.AddRequests("http://sports.163.com/");
    spider.AddRequests("http://ent.163.com/");
    spider.Downloader = new TestDownloader();
    spider.Run();
    stopwatch.Stop();
    var costTime = stopwatch.ElapsedMilliseconds;
    Assert.True(costTime < 3000);

    var results = File.ReadAllLines("FastExit_Result.txt");
    Assert.Contains("http://war.163.com/", results);
    Assert.Contains("http://sports.163.com/", results);
    Assert.Contains("http://ent.163.com/", results);
}
public void RetryWhenResultIsEmpty()
{
    Spider spider = Spider.Create(new TestPageProcessor()).AddPipeline(new TestPipeline());
    spider.ThreadNum = 1;
    spider.EncodingName = "UTF-8";
    spider.CycleRetryTimes = 5;
    spider.SleepTime = 1000;
    spider.AddRequests("http://taobao.com");
    spider.Run();
    Assert.Equal(Status.Finished, spider.Status);
}
public void RetryRequest()
{
    var scheduler = new QueueDuplicateRemovedScheduler();
    Spider spider = Spider.Create(
        // crawler identity
        "cnblogs_" + DateTime.Now.ToString("yyyyMMddhhmmss"),
        // use in-memory queue scheduler
        scheduler,
        // test page processor
        new TestPageProcessor())
        // save crawler results to files in the folder: \{running directory}\data\{crawler identity}\{guid}.dsd
        .AddPipeline(new FilePipeline());
    spider.Monitor = new LogMonitor();
    // download html via http client
    spider.Downloader = new HttpClientDownloader();
    spider.EncodingName = "UTF-8";
    spider.ThreadNum = 1;
    // traversal depth
    spider.Depth = 3;
    spider.ClearSchedulerAfterCompleted = false;
    spider.EmptySleepTime = 6000;
    spider.AddRequests("http://www.baidu.com");
    spider.AddRequests("http://www.163.com/");
    // start the crawler
    spider.Run();

    Assert.Equal(5, spider.RetriedTimes.Value);
    Assert.Equal(0, scheduler.LeftRequestsCount);
    Assert.Equal(1, scheduler.SuccessRequestsCount);
    // retried requests are counted in the error total
    Assert.Equal(6, scheduler.ErrorRequestsCount);
}
/// <summary>
/// Fang.com, Shijiazhuang, new houses (房天下 石家庄 新房)
/// </summary>
/// <param name="args"></param>
static void Main(string[] args)
{
    Spider spider = Spider.Create(
        new QueueDuplicateRemovedScheduler(),
        new Housing.NewHouse.Fang.sjzHousingPageProcessor())
        .AddPipeline(new HousingPipeline());
    spider.EncodingName = "GBK";
    for (int i = 1; i < 16; ++i)
    {
        // Add start/seed urls.
        spider.AddRequests($"https://sjz.newhouse.fang.com/house/s/b9{i}/");
    }
    // Start the crawler
    spider.Run();
    Console.Read();
}
public static void Run()
{
    // load crawl settings (start url, target xpaths/patterns, encoding, downloader) from sohu.xml
    Instance instance = Instance.LoadFrom("sohu.xml");
    var table = new TableInfo("websites", "html");
    var fields = new[]
    {
        new FieldSelector(".//title", "title"),
        new FieldSelector(Env.UrlPropertyKey, "url", SelectorType.Enviroment),
        new FieldSelector(".//body", "content", SelectorType.XPath, DataType.String, int.MaxValue),
        new FieldSelector("is_match", "is_match", SelectorType.XPath, DataType.Bool),
        new FieldSelector("matchs", "matchs", SelectorType.XPath, DataType.String, int.MaxValue),
        new FieldSelector("id", "id", SelectorType.Enviroment, DataType.Int) { IsPrimary = true },
    };
    var targetRequestSelector = new TargetRequestSelector
    {
        XPaths = instance.TargetXpaths,
        Patterns = instance.TargetPatterns,
        ExcludePatterns = instance.ExcludePatterns
    };
    var model = new ModelDefinition(null, fields, table, targetRequestSelector);
    var modeProcessor = new ModelProcessor(model);
    modeProcessor.CleanPound = true;
    modeProcessor.AddDataHanlder(new MyDataHandler());
    Spider spider = Spider.Create(
        new QueueDuplicateRemovedScheduler(),
        modeProcessor)
        .AddPipeline(new MySqlEntityPipeline());
    spider.EncodingName = instance.Encording;
    spider.AddRequests(instance.Url);
    if (instance.Downloader.ToLower() == "chrome")
    {
        spider.Downloader = new WebDriverDownloader(Browser.Chrome, new Option { Headless = true });
    }
    spider.Run();
}
public static void Run()
{
    // Create the crawler with an in-memory scheduler, custom page processors and a custom pipeline
    Spider spider = Spider.Create(
        new QueueDuplicateRemovedScheduler(),
        new BlogSumaryProcessor(),
        new NewsProcessor())
        .AddPipeline(new MyPipeline());
    spider.EncodingName = "UTF-8";
    for (int i = 1; i < 5; ++i)
    {
        // Add start/seed urls
        spider.AddRequests("http://www.cnblogs.com");
    }
    // Start the crawler
    spider.Run();
}
public static void Run()
{
    Spider spider = Spider.Create(
        // use in-memory queue scheduler
        new QueueDuplicateRemovedScheduler(),
        // use a customized processor for Youku
        new YoukuPageProcessor())
        // use a customized pipeline for Youku
        .AddPipeline(new YoukuPipeline());
    spider.EncodingName = "UTF-8";
    for (int i = 1; i < 5; ++i)
    {
        // Add start/seed urls.
        spider.AddRequests($"http://list.youku.com/category/show/c_96_s_1_d_1_p_{i}.html");
    }
    // Start the crawler
    spider.Run();
}
public void RunAsyncAndContiune()
{
    if (Environment.GetEnvironmentVariable("TRAVIS") == "1")
    {
        return;
    }
    Spider spider = Spider.Create(new TestPageProcessor()).AddPipeline(new TestPipeline());
    spider.ThreadNum = 1;
    spider.EncodingName = "UTF-8";
    for (int i = 0; i < 10000; i++)
    {
        spider.AddRequests("http://www.baidu.com/" + i);
    }
    spider.RunAsync();
    Thread.Sleep(5000);
    spider.Pause(() =>
    {
        spider.Contiune();
    });
    Thread.Sleep(5000);
}
public void ProcesserException()
{
    var scheduler = new QueueDuplicateRemovedScheduler();
    Spider spider = Spider.Create(
        // crawler identity
        "youku",
        // use in-memory queue scheduler
        scheduler,
        // test page processor
        new TestPageProcessor())
        // save crawler results to files in the folder: \{running directory}\data\{crawler identity}\{guid}.dsd
        .AddPipeline(new FilePipeline());
    spider.ClearSchedulerAfterCompleted = false;
    // download html via http client
    spider.Downloader = new HttpClientDownloader();
    spider.AddHeaders("v.youku.com", new Dictionary<string, object> { { "Upgrade-Insecure-Requests", "1" } });
    spider.ThreadNum = 1;
    // traversal depth
    spider.Depth = 3;
    spider.EmptySleepTime = 6000;
    spider.AddRequests("http://v.youku.com/v_show/id_XMTMyMTkzNTY1Mg==.html?spm=a2h1n.8251845.0.0");
    spider.AddRequests("http://v.youku.com/v_show/id_XMjkzNzMwMDMyOA==.html?spm=a2h1n.8251845.0.0");
    spider.AddRequests("http://v.youku.com/v_show/id_XMjcwNDg0NDI3Mg==.html?spm=a2h1n.8251845.0.0");
    spider.AddRequests("http://v.youku.com/v_show/id_XMTMwNzQwMTcwMA==.html?spm=a2h1n.8251845.0.0");
    spider.AddRequests("http://v.youku.com/v_show/id_XMjk1MzI0Mzk4NA==.html?spm=a2h1n.8251845.0.0");
    spider.AddRequests("http://v.youku.com/v_show/id_XMjkzNzY0NzkyOA==.html?spm=a2h1n.8251845.0.0");
    spider.AddRequests("http://www.163.com/");
    // start the crawler
    spider.Run();

    Assert.Equal(5, spider.RetriedTimes.Value);
    Assert.Equal(0, scheduler.LeftRequestsCount);
    Assert.Equal(6, scheduler.SuccessRequestsCount);
    Assert.Equal(6, scheduler.ErrorRequestsCount);
}
/// <summary>
/// Start the crawl
/// </summary>
/// <param name="request">crawl requests</param>
/// <param name="parser">page parsers</param>
/// <param name="storage">data storage; defaults to console output</param>
public void Start(List<Request> request, List<IDataFlow> parser, IDataFlow storage = null)
{
    Spider _spider = _provider.Create<Spider>();
    if (null == storage)
    {
        storage = new ConsoleStorage();
    }
    _spider.NewGuidId(); // set the task identity
    _spider.Name = "测试采集"; // set the task name ("test crawl")
    _spider.Speed = 10; // crawl speed: requests downloaded per second; greater than 1 is faster, less than 1 is slower, must not be 0
    _spider.Depth = 3; // set the crawl depth
    if (parser != null)
    {
        foreach (IDataFlow item in parser)
        {
            _spider.AddDataFlow(item);
        }
    }
    _spider.AddDataFlow(storage);
    _spider.AddRequests(request.ToArray()); // add the requests
    _spider.RunAsync(); // start asynchronously
}
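// A minimal usage sketch for the Start method above, illustrative only: `crawlService` stands in
// for an instance of the containing class (not shown here), and the Request(url) constructor is
// an assumption about the Request type rather than something confirmed by this sample.
var requests = new List<Request> { new Request("http://www.cnblogs.com") };
crawlService.Start(requests, parser: null); // null parser and storage fall back to the defaults handled inside Start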