/// <summary>
/// Initialize the spider.
/// </summary>
/// <exception cref="ArgumentNullException">Thrown when Scheduler, DownLoader or PageProcessor is null.</exception>
private void InitSpider()
{
    CheckRunning();
    if (Scheduler == null)
    {
        throw new ArgumentNullException("Scheduler must not be null");
    }
    if (DownLoader == null)
    {
        throw new ArgumentNullException("DownLoader must not be null");
    }
    if (PageProcessor == null)
    {
        throw new ArgumentNullException("PageProcessor must not be null");
    }
    if (Site.MinSleepTime < 500)
    {
        throw new SpiderExceptoin("Sleep time should be larger than 500");
    }
    if (ThreadCount <= 0)
    {
        throw new ArgumentException("ThreadCount should be at least one!");
    }

    // Limit on concurrent HTTP requests.
    ServicePointManager.DefaultConnectionLimit = ThreadCount > 1024 ? ThreadCount : 1024;

    SpiderListening.ForEach(item => item.AfterInit(this));
}
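The ServicePointManager.DefaultConnectionLimit assignment matters because the classic HttpWebRequest/WebClient stack allows only two concurrent connections per host by default, which would throttle a multi-threaded crawler. Below is a minimal standalone sketch of that setting in isolation; it is not part of the spider, and the URL and task count are arbitrary:

using System;
using System.Net;
using System.Threading.Tasks;

class ConnectionLimitDemo
{
    static void Main()
    {
        // Raise the per-host connection limit before issuing any request,
        // mirroring what InitSpider does based on ThreadCount.
        ServicePointManager.DefaultConnectionLimit = 64;

        var tasks = new Task[8];
        for (var i = 0; i < tasks.Length; i++)
        {
            tasks[i] = Task.Run(() =>
            {
                var request = (HttpWebRequest)WebRequest.Create("https://example.com/");
                using (var response = (HttpWebResponse)request.GetResponse())
                {
                    Console.WriteLine(response.StatusCode);
                }
            });
        }
        Task.WaitAll(tasks);
    }
}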
/// <summary>
/// Run the spider.
/// </summary>
public void Run()
{
    InitSpider();
    Status = SpiderStatusEnum.Running;
    var parallelOptions = new ParallelOptions
    {
        MaxDegreeOfParallelism = ThreadCount
    };
    Parallel.For(0, ThreadCount, parallelOptions, i =>
    {
        while (Status == SpiderStatusEnum.Running)
        {
            Request request = null;
            try
            {
                request = Scheduler.GetRequest();
                if (request == null)
                {
                    break;
                }
                ProcessRequest(request, DownLoader);
                Thread.Sleep(_random.Next(Site.MinSleepTime, Site.MaxSleepTime));
            }
            catch (Exception e)
            {
                SpiderListening.ForEach(item => item.ErrorHandler(request, e));
            }
        }
    });
}
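Run is essentially a bounded worker pool: Parallel.For starts ThreadCount workers, each of which keeps pulling requests from the Scheduler until it returns null, sleeping a random interval between requests for politeness. Here is a minimal standalone sketch of the same pattern, with a ConcurrentQueue standing in for the Scheduler; the names and values are illustrative only:

using System;
using System.Collections.Concurrent;
using System.Threading;
using System.Threading.Tasks;

class WorkerLoopDemo
{
    static void Main()
    {
        var queue = new ConcurrentQueue<string>(new[] { "url-1", "url-2", "url-3", "url-4" });
        var options = new ParallelOptions { MaxDegreeOfParallelism = 2 };

        Parallel.For(0, 2, options, workerId =>
        {
            // One Random per worker: System.Random is not thread-safe.
            var random = new Random(workerId);

            // Each worker loops until the shared queue is drained,
            // just as each spider thread loops until GetRequest() returns null.
            while (queue.TryDequeue(out var url))
            {
                Console.WriteLine($"worker {workerId} processing {url}");
                Thread.Sleep(random.Next(100, 300)); // politeness delay, like MinSleepTime/MaxSleepTime
            }
        });
    }
}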
/// <summary>
/// Download and process a single page.
/// </summary>
private void ProcessRequest(Request request, IDownLoader downLoader)
{
    var page = downLoader.DownLoader(request, this);
    PageProcessor.Process(page);
    Scheduler.AddFinishRequest(request);
    SpiderListening.ForEach(item => item.AfterSuccess(request));
    if (page.IsSave)
    {
        Pipelines.ForEach(item => item.Process(page.PageResult));
    }
    // Extract new URLs from the page and add them to the waiting queue.
    GetPageUrl(page).ForEach(item => Scheduler.AddWaitRequest(new Request(item)));
}
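ProcessRequest ties the components together: the downloader fetches the page, the processor extracts data, the finished request is recorded, results flow into the pipelines, and newly discovered URLs are queued for later. The standalone sketch below illustrates that download, extract, enqueue cycle using HttpClient and a naive href regex; these merely stand in for IDownLoader and GetPageUrl and are not the framework's actual implementation:

using System;
using System.Collections.Generic;
using System.Net.Http;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

class ProcessRequestDemo
{
    static async Task Main()
    {
        using (var client = new HttpClient())
        {
            // "Download": fetch the raw HTML, as IDownLoader.DownLoader would.
            var html = await client.GetStringAsync("https://example.com/");

            // "Process": here we only extract outgoing absolute links.
            var newUrls = ExtractUrls(html);

            // New URLs would normally go back into the scheduler's wait queue.
            foreach (var url in newUrls)
            {
                Console.WriteLine("would enqueue: " + url);
            }
        }
    }

    static List<string> ExtractUrls(string html)
    {
        var urls = new List<string>();
        foreach (Match m in Regex.Matches(html, "href=\"(https?://[^\"]+)\"", RegexOptions.IgnoreCase))
        {
            urls.Add(m.Groups[1].Value);
        }
        return urls;
    }
}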