/// <summary>
/// Initializes every internal component the spider needs before a crawl can start:
/// site, downloader, scheduler, pipelines, monitors and the start-request queue.
/// </summary>
/// <param name="arguments">Optional run arguments forwarded to the individual init steps.</param>
protected virtual void InitComponents(params string[] arguments)
{
    PrintInfo.Print();
    Logger.Information("Build internal component...");

#if !NETSTANDARD
    // Raise the per-host connection limit so many download threads can run concurrently.
    ServicePointManager.DefaultConnectionLimit = 1000;
#endif

    InitSite();
    InitDownloader();
    InitScheduler(arguments);

    // At least one page processor is mandatory; fail fast before building anything else.
    bool hasProcessors = _pageProcessors != null && _pageProcessors.Count > 0;
    if (!hasProcessors)
    {
        throw new SpiderException("Count of PageProcessor is zero");
    }

    InitPipelines(arguments);
    InitCloseSignals();
    InitMonitor();
    InitErrorRequestsLog();

    BuildStartUrlBuilders(arguments);
    PushStartRequestsToScheduler();

    _monitorReportInterval = CalculateMonitorReportInterval();

    // NOTE(review): the handler is only wired when input IS redirected — confirm this
    // condition is intentional rather than inverted.
    if (Console.IsInputRedirected)
    {
        Console.CancelKeyPress += ConsoleCancelKeyPress;
    }

    // Number of idle polls tolerated before the crawl is treated as complete.
    _waitCountLimit = EmptySleepTime / WaitInterval;

    _inited = true;
}
/// <summary>
/// Runs the crawl: initializes components, then drains the scheduler with
/// ThreadNum parallel workers until the queue stays empty, an exit signal
/// arrives, or the status leaves Running/Paused.
/// </summary>
/// <param name="arguments">
/// Run arguments; "notrealrun" (case-insensitive) initializes components and returns
/// without crawling.
/// </param>
protected override void Execute(params string[] arguments)
{
    ValidateSettings();

    if (_inited || Status == Status.Running)
    {
        Logger.Warning("Crawler is running...");
        return;
    }

    InitComponents(arguments);

    // FIX: ordinal case-insensitive comparison instead of `a?.ToLower() == "notrealrun"`.
    // ToLower() is culture-sensitive (e.g. Turkish dotless-I rules) and allocates;
    // string.Equals is also null-safe, matching the original `a?.` behavior.
    if (arguments.Any(a => string.Equals(a, "notrealrun", StringComparison.OrdinalIgnoreCase)))
    {
        return;
    }

    StartTime = DateTime.Now;
    Status = Status.Running;
    _exited = false;

    ReportStatus();

    while (Status == Status.Running || Status == Status.Paused)
    {
        // Paused: spin here until resumed or stopped.
        if (Status == Status.Paused)
        {
            Thread.Sleep(50);
            continue;
        }

        Parallel.For(0, ThreadNum, new ParallelOptions { MaxDegreeOfParallelism = ThreadNum }, i =>
        {
            int waitCount = 1;
            // Each worker thread gets its own downloader clone, so downloaders such as
            // WebDriverDownloader need no external WebDriver instance management.
            var downloader = Downloader.Clone();
            while (Status == Status.Running)
            {
                // Pull the next request from the scheduler queue.
                Request request = Scheduler.Poll();

                if (request == null)
                {
                    // Queue empty: wait up to the EmptySleepTime budget; once the budget
                    // is exhausted the crawl is considered finished.
                    if (waitCount > _waitCountLimit && ExitWhenComplete)
                    {
                        Status = Status.Finished;
                        OnCompleted?.Invoke(this);
                        break;
                    }
                    // Wait until a new url is added.
                    WaitNewUrl(ref waitCount);
                }
                else
                {
                    waitCount = 1;

                    try
                    {
                        Stopwatch sw = new Stopwatch();
                        HandleRequest(sw, request, downloader);
                        Thread.Sleep(Site.SleepTime);
                    }
                    catch (Exception e)
                    {
                        OnError(request);
                        Logger.Error($"Crawler {request.Url} failed: {e}.");
                    }
                    finally
                    {
                        if (request.Proxy != null)
                        {
                            // Return the proxy to the pool; a missing status code is reported
                            // as 302 Found — presumably a pool convention, TODO confirm.
                            var statusCode = request.StatusCode;
                            Site.HttpProxyPool.ReturnProxy(request.Proxy, statusCode ?? HttpStatusCode.Found);
                        }
                        _requestedCount.Inc();

                        if (_requestedCount.Value % _monitorReportInterval == 0)
                        {
                            ReportStatus();
                            CheckExitSignal();
                        }
                    }
                }
            }
            SafeDestroy(downloader);
        });
    }

    string msg = Status != Status.Finished ? "Crawl terminated" : "Crawl complete";
    EndTime = DateTime.Now;
    ReportStatus();
    OnClose();
    Logger.Information($"{msg}, cost: {(EndTime - StartTime).TotalSeconds} seconds.");
    PrintInfo.PrintLine();
    OnClosed?.Invoke(this);
    _exited = true;
}