Example #1
        /// <summary>
        /// Initializes the components of the spider.
        /// </summary>
        /// <param name="arguments">Run arguments passed to the spider.</param>
        protected virtual void InitComponents(params string[] arguments)
        {
            PrintInfo.Print();

            Logger.Information("Build internal component...");

#if !NETSTANDARD // Enable multi-threading support by raising the default connection limit
            ServicePointManager.DefaultConnectionLimit = 1000;
#endif

            InitSite();

            InitDownloader();

            InitScheduler(arguments);

            if (_pageProcessors == null || _pageProcessors.Count == 0)
            {
                throw new SpiderException("Count of PageProcessor is zero");
            }

            InitPipelines(arguments);

            InitCloseSignals();

            InitMonitor();

            InitErrorRequestsLog();

            BuildStartUrlBuilders(arguments);

            PushStartRequestsToScheduler();

            _monitorReportInterval = CalculateMonitorReportInterval();

            if (Console.IsInputRedirected)
            {
                Console.CancelKeyPress += ConsoleCancelKeyPress;
            }

            _waitCountLimit = EmptySleepTime / WaitInterval;

            _inited = true;
        }
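
The initializer's last steps derive _waitCountLimit from EmptySleepTime / WaitInterval: the number of consecutive empty polls a worker tolerates before the crawl is treated as complete (used in Example #2). The standalone sketch below only illustrates that arithmetic; the WaitLimitDemo class and its constant values are hypothetical and not part of the spider.

using System;

class WaitLimitDemo
{
    // Hypothetical settings standing in for the spider's EmptySleepTime and WaitInterval properties.
    const int EmptySleepTime = 15000; // total idle time (ms) tolerated on an empty queue
    const int WaitInterval = 10;      // sleep (ms) per empty poll

    static void Main()
    {
        // Mirrors _waitCountLimit = EmptySleepTime / WaitInterval in InitComponents:
        // the number of consecutive empty polls allowed before the crawl is considered finished.
        int waitCountLimit = EmptySleepTime / WaitInterval;
        Console.WriteLine($"A worker gives up after {waitCountLimit} consecutive empty polls " +
                          $"({waitCountLimit * WaitInterval} ms of idling).");
    }
}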
Example #2
        protected override void Execute(params string[] arguments)
        {
            ValidateSettings();

            if (_inited || Status == Status.Running)
            {
                Logger.Warning("Crawler is running...");
                return;
            }

            InitComponents(arguments);

            if (arguments.Any(a => a?.ToLower() == "notrealrun"))
            {
                return;
            }

            StartTime = DateTime.Now;
            Status    = Status.Running;
            _exited   = false;

            ReportStatus();

            while (Status == Status.Running || Status == Status.Paused)
            {
                // While paused, keep waiting here
                if (Status == Status.Paused)
                {
                    Thread.Sleep(50);
                    continue;
                }

                Parallel.For(0, ThreadNum, new ParallelOptions
                {
                    MaxDegreeOfParallelism = ThreadNum
                }, i =>
                {
                    int waitCount = 1;
                    // Each thread uses its own downloader instance; with downloaders such as WebDriverDownloader this avoids managing WebDriver instances manually
                    var downloader = Downloader.Clone();
                    while (Status == Status.Running)
                    {
                        // Take the next request from the scheduler queue
                        Request request = Scheduler.Poll();

                        // If the queue has no pending requests, start waiting; once the configured EmptySleepTime has elapsed, the spider is considered finished
                        if (request == null)
                        {
                            if (waitCount > _waitCountLimit && ExitWhenComplete)
                            {
                                Status = Status.Finished;
                                OnCompleted?.Invoke(this);
                                break;
                            }

                            // Wait until a new URL is added
                            WaitNewUrl(ref waitCount);
                        }
                        else
                        {
                            waitCount = 1;

                            try
                            {
                                Stopwatch sw = new Stopwatch();
                                HandleRequest(sw, request, downloader);
                                Thread.Sleep(Site.SleepTime);
                            }
                            catch (Exception e)
                            {
                                OnError(request);
                                Logger.Error($"Crawler {request.Url} failed: {e}.");
                            }
                            finally
                            {
                                if (request.Proxy != null)
                                {
                                    var statusCode = request.StatusCode;
                                    Site.HttpProxyPool.ReturnProxy(request.Proxy, statusCode ?? HttpStatusCode.Found);
                                }

                                _requestedCount.Inc();

                                if (_requestedCount.Value % _monitorReportInterval == 0)
                                {
                                    ReportStatus();
                                    CheckExitSignal();
                                }
                            }
                        }
                    }

                    SafeDestroy(downloader);
                });
            }
            string msg = Status != Status.Finished ? "Crawl terminated" : "Crawl complete";

            EndTime = DateTime.Now;

            ReportStatus();
            OnClose();

            Logger.Information($"{msg}, cost: {(EndTime - StartTime).TotalSeconds} seconds.");
            PrintInfo.PrintLine();

            OnClosed?.Invoke(this);
            _exited = true;
        }
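
Example #2's worker loop combines three ideas: each worker clones its own downloader, polls the scheduler for the next request, and counts consecutive empty polls so it can stop once EmptySleepTime has elapsed. The sketch below mirrors that structure in a self-contained form, with a ConcurrentQueue standing in for the scheduler; the WorkerLoopDemo class, its names, and its values are hypothetical and not part of the library.

using System;
using System.Collections.Concurrent;
using System.Threading;
using System.Threading.Tasks;

class WorkerLoopDemo
{
    static void Main()
    {
        // A stand-in for the scheduler; the real spider calls Scheduler.Poll().
        var scheduler = new ConcurrentQueue<string>(new[] { "https://a", "https://b", "https://c" });

        int threadNum = 2;        // mirrors ThreadNum
        int waitCountLimit = 100; // mirrors _waitCountLimit
        int waitIntervalMs = 10;  // mirrors WaitInterval

        Parallel.For(0, threadNum, new ParallelOptions { MaxDegreeOfParallelism = threadNum }, i =>
        {
            int waitCount = 1;
            // In the spider each worker clones its own downloader; here a label stands in for it.
            string downloader = $"downloader-{i}";

            while (true)
            {
                if (scheduler.TryDequeue(out var url))
                {
                    waitCount = 1;
                    Console.WriteLine($"{downloader} handled {url}");
                }
                else
                {
                    // Empty queue: back off, and give up once the limit is exceeded,
                    // analogous to the EmptySleepTime check in Execute.
                    if (waitCount > waitCountLimit) break;
                    waitCount++;
                    Thread.Sleep(waitIntervalMs);
                }
            }
        });
    }
}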