/// <summary>
/// Sample entry point: seeds a <c>Site</c> with four Youku listing pages, builds a
/// single-threaded spider around <c>MyPageProcessor</c> and <c>MyPipeline</c>,
/// registers it with the monitor, runs it, and finally waits for a character on
/// standard input.
/// </summary>
public static void Run()
{
    // Inject the monitor service.
    IocExtension.ServiceCollection.AddSingleton<IMonitorService, NLogMonitor>();

    // Define the Site to crawl; headers, cookies, proxies, etc. can be set on it.
    var site = new Site { EncodingName = "UTF-8" };

    // Add the initial links: listing pages 1 through 4.
    const string scheme = "http://";
    int page = 1;
    while (page < 5)
    {
        site.AddStartUrl(scheme + $"www.youku.com/v_olist/c_97_g__a__sg__mt__lg__q__s_1_r_0_u_0_pt_0_av_0_ag_0_sg__pr__h__d_1_p_{page}.html");
        ++page;
    }

    // Create the spider from an in-memory scheduler, a custom page processor
    // and a custom pipeline.
    var processor = new MyPageProcessor();
    var scheduler = new QueueDuplicateRemovedScheduler();
    Spider spider = Spider
        .Create(site, processor, scheduler)
        .AddPipeline(new MyPipeline())
        .SetThreadNum(1);
    spider.EmptySleepTime = 3000;

    // Register the spider with the monitor service.
    SpiderMonitor.Register(spider);

    // Start the spider.
    spider.Run();

    Console.Read();
}
/// <summary>
/// Runs a single-threaded spider over five Baidu URLs with the
/// "logAndStatusConnectString" setting pointing at a local MySQL instance, then
/// asserts that exactly one row in <c>dotnetspider.status</c> and seven rows in
/// <c>dotnetspider.log</c> match this run's identity, user id and task group.
/// </summary>
public void DatebaseLogAndStatus()
{
    IocExtension.ServiceCollection.AddSingleton<IMonitorService, NLogMonitor>();

    // Fresh identifiers so the count queries below only match rows from this run.
    string id = Guid.NewGuid().ToString("N");
    string taskGroup = Guid.NewGuid().ToString("N");
    string userId = Guid.NewGuid().ToString("N");

    // Store the connection string in the configuration and read it back.
    const string settingKey = "logAndStatusConnectString";
    string connectString = "Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306";
    Configuration.SetValue(settingKey, connectString);
    Assert.Equal(connectString, Configuration.GetValue(settingKey));

    LogManagerHelper.InitLogManager(true);

    // Collaborators are created in the same order the Create call consumes them.
    var site = new Site { EncodingName = "UTF-8", MinSleepTime = 1000 };
    var processor = new TestPageProcessor();
    var scheduler = new QueueDuplicateRemovedScheduler();
    using (Spider spider = Spider.Create(site, id, userId, taskGroup, processor, scheduler))
    {
        spider.AddPipeline(new TestPipeline()).SetThreadNum(1);

        // Start URLs http://www.baidu.com/0 .. http://www.baidu.com/4.
        for (int suffix = 0; suffix <= 4; ++suffix)
        {
            spider.AddStartUrl("http://www.baidu.com/" + suffix);
        }

        SpiderMonitor.Register(spider);
        spider.Run();
    }

    using (MySqlConnection conn = new MySqlConnection(connectString))
    {
        // One status row is expected for this run.
        string statusSql = $"SELECT COUNT(*) as Count FROM dotnetspider.status where userid='{userId}' and taskgroup='{taskGroup}' and identity='{id}'";
        var statusRows = conn.Query<CountResult>(statusSql).First().Count;
        Assert.Equal(1, statusRows);

        // Seven log rows are expected for this run.
        string logSql = $"SELECT COUNT(*) as Count FROM dotnetspider.log where userid='{userId}' and taskgroup='{taskGroup}' and identity='{id}'";
        var logRows = conn.Query<CountResult>(logSql).First().Count;
        Assert.Equal(7, logRows);
    }
}
/// <summary>
/// Sample entry point: creates an <c>OoSpider</c> for http://www.36kr.com/ from
/// <c>CollectorPageModelToDbPipeline</c> and <c>Kr36NewsModel</c>, sets it to 20
/// threads, calls <c>Start()</c> and then registers it with the
/// <c>SpiderMonitor</c> singleton.
/// </summary>
public static void Run()
{
    var site = new Site();
    site.AddStartUrl("http://www.36kr.com/");

    var pipeline = new CollectorPageModelToDbPipeline();
    Core.Spider spider = OoSpider
        .Create(site, pipeline, typeof(Kr36NewsModel))
        .SetThreadNum(20);

    // The spider is started first and registered with the monitor afterwards.
    spider.Start();
    SpiderMonitor.Instance.Register(spider);
}
/// <summary>
/// Sample entry point: registers <c>NLogMonitor</c> as the monitor service, seeds
/// a UTF-8 <c>Site</c> with Youku listing pages 1 to 4, runs a single-threaded
/// spider built from <c>MyPageProcessor</c>, <c>QueueDuplicateRemovedScheduler</c>
/// and <c>MyPipeline</c>, and then waits for a character on standard input.
/// </summary>
public static void Run()
{
    IocExtension.ServiceCollection.AddSingleton<IMonitorService, NLogMonitor>();

    var site = new Site { EncodingName = "UTF-8" };
    for (int pageNumber = 1; pageNumber <= 4; pageNumber++)
    {
        // The scheme and the interpolated remainder are concatenated into one URL.
        string hostAndPath = $"www.youku.com/v_olist/c_97_g__a__sg__mt__lg__q__s_1_r_0_u_0_pt_0_av_0_ag_0_sg__pr__h__d_1_p_{pageNumber}.html";
        site.AddStartUrl("http://" + hostAndPath);
    }

    var pageProcessor = new MyPageProcessor();
    var scheduler = new QueueDuplicateRemovedScheduler();
    Spider spider = Spider.Create(site, pageProcessor, scheduler)
        .AddPipeline(new MyPipeline())
        .SetThreadNum(1);

    // Register with the monitor before the run starts.
    SpiderMonitor.Register(spider);
    spider.Run();

    Console.Read();
}
/// <summary>
/// Builds the entity processor and per-entity pipelines, prepares the start
/// requests (under a lock taken on <c>Db</c> when <c>Db</c> is set), registers
/// the spider with the monitor and control, and then runs it. The spider and
/// the monitor are always disposed on exit.
/// </summary>
/// <param name="arguments">
/// Optional switches. "rerun" re-initialises and clears the scheduler, deletes
/// the verify record and forces the start requests to be rebuilt.
/// "running-test" skips <c>base.Run()</c> and only sets <c>IsExited</c>.
/// A null array is treated as empty.
/// </param>
public override void Run(params string[] arguments)
{
    // Run(null) binds null to the params array. Treat that as "no switches"
    // rather than failing inside Contains further down.
    if (arguments == null)
    {
        arguments = new string[0];
    }

    InitEnvorimentAndVerify();

    try
    {
#if !NET_CORE
        if (CookieInterceptor != null)
        {
            // Log: "Trying to get cookie..."
            Logger.SaveLog(LogInfo.Create("尝试获取 Cookie...", Logger.Name, this, LogLevel.Info));
            var cookie = CookieInterceptor.GetCookie();
            if (cookie == null)
            {
                // Log: "Failed to get cookie, the spider cannot continue."
                Logger.SaveLog(LogInfo.Create("获取 Cookie 失败, 爬虫无法继续.", Logger.Name, this, LogLevel.Error));
                return;
            }

            Site.CookiesStringPart = cookie.CookiesStringPart;
            Site.Cookies = cookie.CookiesDictionary;
        }
#endif

        // Log: "Creating spider..."
        Logger.SaveLog(LogInfo.Create("创建爬虫...", Logger.Name, this, LogLevel.Info));

        // One processor handles every configured entity.
        EntityProcessor processor = new EntityProcessor(this);
        foreach (var entity in Entities)
        {
            processor.AddEntity(entity);
        }
        PageProcessor = processor;

        // Each entity gets its own clones of the configured entity pipelines;
        // only clones that report IsEnabled after InitiEntity are kept.
        foreach (var entity in Entities)
        {
            string entityName = entity.Entity.Name;
            var pipelines = new List<BaseEntityPipeline>();
            foreach (var pipeline in EntityPipelines)
            {
                var newPipeline = pipeline.Clone();
                newPipeline.InitiEntity(entity);
                if (newPipeline.IsEnabled)
                {
                    pipelines.Add(newPipeline);
                }
            }

            if (pipelines.Count > 0)
            {
                Pipelines.Add(new EntityPipeline(entityName, pipelines));
            }
        }

        CheckIfSettingsCorrect();

        bool needInitStartRequest = true;
        string key = "locker-" + Identity;
        bool lockTaken = false;
        try
        {
            if (Db != null)
            {
                // Retry once per second until the lock (10 minute expiry) is taken.
                while (!Db.LockTake(key, "0", TimeSpan.FromMinutes(10)))
                {
                    Thread.Sleep(1000);
                }
                lockTaken = true;

                // Start requests are skipped when the init status for this
                // identity already reads "init finished".
                var lockerValue = Db.HashGet(InitStatusSetName, Identity);
                needInitStartRequest = lockerValue != "init finished";
            }

            if (arguments.Contains("rerun"))
            {
                Scheduler.Init(this);
                Scheduler.Clear();
                // Delete the verify record.
                Db?.HashDelete(ValidateStatusName, Identity);
                needInitStartRequest = true;
            }

            // Log: "Building internal modules and preparing spider data..."
            Logger.SaveLog(LogInfo.Create("构建内部模块、准备爬虫数据...", Logger.Name, this, LogLevel.Info));
            InitComponent();

            if (needInitStartRequest && PrepareStartUrls != null)
            {
                for (int i = 0; i < PrepareStartUrls.Length; ++i)
                {
                    var prepareStartUrl = PrepareStartUrls[i];
                    // Log: "[Step N] Adding links to the scheduler." (N starts at 2)
                    Logger.SaveLog(LogInfo.Create($"[步骤 {i + 2}] 添加链接到调度中心.", Logger.Name, this, LogLevel.Info));
                    prepareStartUrl.Build(this, null);
                }
            }

            SpiderMonitor.Register(this);
        }
        finally
        {
            // Release the lock even when preparation throws, so that other
            // instances waiting in the LockTake loop are not stalled until the
            // 10 minute expiry. Only release a lock this call actually took:
            // every instance uses the same lock value, so an unconditional
            // release could drop a lock held by someone else.
            if (lockTaken)
            {
                Db?.LockRelease(key, 0);
            }
        }

        RegisterControl(this);

        if (arguments.Contains("running-test"))
        {
            IsExited = true;
        }
        else
        {
            base.Run();
        }

        TaskFinished();
        HandleVerifyCollectData();
    }
    finally
    {
        Dispose();
        SpiderMonitor.Dispose();
    }
}