Exemple #1
0
        /// <summary>
        /// Runs the spider and blocks until every worker has left its polling loop.
        /// </summary>
        /// <remarks>
        /// Starts <c>ThreadNum</c> parallel workers. Each worker clones the downloader, then
        /// polls the scheduler and processes requests while <c>Stat</c> is <c>Status.Running</c>.
        /// After the workers return, the pipelines are destroyed, the closing event is raised
        /// and <c>IsExited</c> is set to <c>true</c>.
        /// </remarks>
        public void Run()
        {
            CheckIfRunning();

            Stat     = Status.Running;
            IsExited = false;

#if !NET_CORE
            // Enable multi-threaded downloading by raising the default connection limit.
            System.Net.ServicePointManager.DefaultConnectionLimit = 1000;
#endif

            InitComponent();

            // The monitor is only needed for the progress log line inside the workers, so a
            // scheduler that is not monitorable must not abort the run with an
            // InvalidCastException after the spider has already been marked as running.
            IMonitorableScheduler monitor = Scheduler as IMonitorableScheduler;

            // Record the start time only once, so a later call does not overwrite it.
            if (StartTime == DateTime.MinValue)
            {
                StartTime = DateTime.Now;
            }

            Parallel.For(0, ThreadNum, new ParallelOptions
            {
                MaxDegreeOfParallelism = ThreadNum
            }, i =>
            {
                // Per-worker state: consecutive empty polls, and whether the one-off
                // pause after the first processed request has already happened.
                int waitCount  = 0;
                bool firstTask = false;

                // Each worker uses its own downloader clone.
                var downloader = Downloader.Clone();

                while (Stat == Status.Running)
                {
                    Request request = Scheduler.Poll(this);

                    if (request == null)
                    {
                        // Nothing queued: finish once the wait limit is exceeded and the
                        // spider is configured to exit on completion.
                        if (waitCount > _waitCountLimit && IsExitWhenComplete)
                        {
                            Stat = Status.Finished;
                            break;
                        }

                        // wait until new url added
                        WaitNewUrl(ref waitCount);
                    }
                    else
                    {
                        if (monitor != null)
                        {
                            Log.WriteLine($"Left: {monitor.GetLeftRequestsCount(this)} Total: {monitor.GetTotalRequestsCount(this)} Thread: {ThreadNum}");
                        }

                        waitCount = 0;

                        try
                        {
                            ProcessRequest(request, downloader);
                            Thread.Sleep(Site.SleepTime);
#if TEST
                            System.Diagnostics.Stopwatch sw = new System.Diagnostics.Stopwatch();
                            sw.Reset();
                            sw.Start();
#endif

                            OnSuccess(request);
#if TEST
                            sw.Stop();
                            Console.WriteLine("OnSuccess:" + (sw.ElapsedMilliseconds).ToString());
#endif
                        }
                        catch (Exception e)
                        {
                            OnError(request);
                            Logger.Error("采集失败: " + request.Url + ".", e);
                        }
                        finally
                        {
#if !NET_CORE
                            // Hand the proxy back to the pool only if this request carried one.
                            // NOTE(review): the status-code extra is unboxed unconditionally;
                            // confirm it is always set whenever a proxy is attached.
                            if (Site.HttpProxyPoolEnable && request.GetExtra(Request.Proxy) != null)
                            {
                                Site.ReturnHttpProxyToPool((HttpHost)request.GetExtra(Request.Proxy), (int)request.GetExtra(Request.StatusCode));
                            }
#endif
                            // Counted for both successful and failed requests.
                            FinishedPageCount.Inc();
                        }

                        // One-off 3 second pause after this worker's first request.
                        // NOTE(review): presumably gives the first page time to enqueue
                        // follow-up requests - confirm the intent.
                        if (!firstTask)
                        {
                            Thread.Sleep(3000);
                            firstTask = true;
                        }
                    }
                }
            });

            FinishedTime = DateTime.Now;

            foreach (IPipeline pipeline in Pipelines)
            {
                SafeDestroy(pipeline);
            }

            if (Stat == Status.Finished)
            {
                OnClose();

                Logger.Info($"任务 {Identity} 结束.");
            }

            if (Stat == Status.Stopped)
            {
                Logger.Info("任务 " + Identity + " 停止成功!");
            }

            SpiderClosingEvent?.Invoke();

            Log.WaitForExit();

            if (Stat == Status.Exited)
            {
                Logger.Info("任务 " + Identity + " 退出成功!");
            }

            IsExited = true;
        }
Exemple #2
0
        /// <summary>
        /// Runs the spider: polls the scheduler on the calling thread and hands each request
        /// to <c>ThreadPool</c> for processing, then waits for the pool and shuts down.
        /// </summary>
        /// <remarks>
        /// The polling loop ends when <c>Stat</c> is no longer <c>StatRunning</c>, when no
        /// request is available and no pool thread is alive (with <c>ExitWhenComplete</c> set),
        /// or when <c>_waitCount</c> exceeds <c>_waitCountLimit</c>. The state is set to
        /// <c>StatFinished</c> at the end in every case.
        /// </remarks>
        public void Run()
        {
            // Required for multi-threaded crawling: lift the default connection limit.
            System.Net.ServicePointManager.DefaultConnectionLimit = int.MaxValue;

            CheckRunningStat();

            Logger.Info("Spider " + Identify + " InitComponent...");
            InitComponent();

            Logger.Info("Spider " + Identify + " Started!");

            // Tracks whether the one-off pause after the first dispatched request has happened.
            bool firstTask = false;

            while (Stat.Value == StatRunning)
            {
                Request request = Scheduler.Poll(this);

                if (request == null)
                {
                    // Queue is empty and no work is in flight: nothing more can arrive.
                    if (ThreadPool.GetThreadAlive() == 0 && ExitWhenComplete)
                    {
                        break;
                    }

                    if (_waitCount > _waitCountLimit)
                    {
                        break;
                    }

                    // wait until new url added
                    WaitNewUrl();
                }
                else
                {
                    // Record the start time when the first request is seen.
                    if (_startTime == DateTime.MinValue)
                    {
                        _startTime = DateTime.Now;
                    }

                    _waitCount = 0;

                    // The work item returns 1 on success, -1 on failure and 0 when the
                    // state object is not a Request.
                    ThreadPool.Execute((obj, cts) =>
                    {
                        var request1 = obj as Request;
                        if (request1 != null)
                        {
                            try
                            {
                                ProcessRequest(request1, cts);
                                OnSuccess(request1);
                                Uri uri = new Uri(request1.Url);
                                Logger.Info($"Request: { HttpUtility.HtmlDecode(HttpUtility.UrlDecode(uri.Query))} Sucess.");
                                return(1);
                            }
                            catch (Exception e)
                            {
                                OnError(request1);
                                Logger.Error("Request " + request1.Url + " failed.", e);
                                return(-1);
                            }
                            finally
                            {
                                // Only hand a proxy back when this request actually carried one;
                                // a request that never had a proxy attached has nothing to return.
                                if (_site.GetHttpProxyPool().Enable && request1.GetExtra(Request.Proxy) != null)
                                {
                                    _site.ReturnHttpProxyToPool((HttpHost)request1.GetExtra(Request.Proxy), (int)request1.GetExtra(Request.StatusCode));
                                }
                                // Counted for both successful and failed requests.
                                _pageCount.Inc();
                            }
                        }

                        return(0);
                    }, request);

                    // One-off 3 second pause after the first request is dispatched.
                    // NOTE(review): presumably gives the first page time to enqueue
                    // follow-up requests - confirm the intent.
                    if (!firstTask)
                    {
                        Thread.Sleep(3000);
                        firstTask = true;
                    }
                }
            }

            ThreadPool.WaitToEnd();

            // release some resources
            if (DestroyWhenExit)
            {
                Close();
            }

            _endTime = DateTime.Now;

            OnClose();

            Stat.Set(StatFinished);
        }
Exemple #3
0
        /// <summary>
        /// Runs the spider: polls the scheduler on the calling thread and pushes each request
        /// to <c>ThreadPool</c> for processing, then waits for the pool and cleans up.
        /// </summary>
        /// <remarks>
        /// The polling loop runs while <c>Stat</c> is <c>Status.Running</c>. It marks the spider
        /// as finished when no request is available, no pool thread is alive, the wait limit is
        /// exceeded and <c>IsExitWhenComplete</c> is set. Pipelines are destroyed after the pool
        /// has exited, and <c>_runningExit</c> is set to <c>true</c> last.
        /// </remarks>
        public void Run()
        {
            CheckIfRunning();

            Stat         = Status.Running;
            _runningExit = false;

#if !NET_CORE
            // Required for multi-threaded crawling: lift the default connection limit.
            System.Net.ServicePointManager.DefaultConnectionLimit = int.MaxValue;
#endif
            Logger.Info("Spider " + Identity + " InitComponent...");
            InitComponent();

            Logger.Info("Spider " + Identity + " Started!");

            // Tracks whether the one-off pause after the first pushed request has happened.
            bool firstTask = false;

            while (Stat == Status.Running)
            {
                Request request = Scheduler.Poll(this);

                if (request == null)
                {
                    // Only consider finishing when no pushed work is still in flight.
                    if (ThreadPool.ThreadAlive == 0)
                    {
                        if (_waitCount > _waitCountLimit && IsExitWhenComplete)
                        {
                            Stat = Status.Finished;
                            break;
                        }
                    }

                    // wait until new url added
                    WaitNewUrl();
                }
                else
                {
                    // Record the start time when the first request is seen.
                    if (StartTime == DateTime.MinValue)
                    {
                        StartTime = DateTime.Now;
                    }

                    _waitCount = 0;

                    // The work item returns true when the state object was a Request
                    // (whether or not processing succeeded) and false otherwise.
                    ThreadPool.Push(obj =>
                    {
                        var request1 = obj as Request;
                        if (request1 != null)
                        {
                            try
                            {
                                ProcessRequest(request1);

                                Thread.Sleep(Site.SleepTime);

                                OnSuccess(request1);
                            }
                            catch (Exception e)
                            {
                                OnError(request1);
                                Logger.Error("Request " + request1.Url + " failed.", e);
                            }
                            finally
                            {
#if !NET_CORE
                                // Only hand a proxy back when this request actually carried one;
                                // a request that never had a proxy attached has nothing to return.
                                if (Site.HttpProxyPoolEnable && request1.GetExtra(Request.Proxy) != null)
                                {
                                    Site.ReturnHttpProxyToPool((HttpHost)request1.GetExtra(Request.Proxy), (int)request1.GetExtra(Request.StatusCode));
                                }
#endif
                                // Counted for both successful and failed requests.
                                FinishedPageCount.Inc();
                            }
                            return(true);
                        }

                        return(false);
                    }, request);

                    // One-off 3 second pause after the first request is pushed.
                    // NOTE(review): presumably gives the first page time to enqueue
                    // follow-up requests - confirm the intent.
                    if (!firstTask)
                    {
                        Thread.Sleep(3000);
                        firstTask = true;
                    }
                }
            }

            _waitingToExit = true;

            ThreadPool.WaitToExit();
            FinishedTime = DateTime.Now;

            // Pipelines may still hold cached data; it has to be cleaned up or saved
            // before the spider can safely exit or pause.
            foreach (IPipeline pipeline in Pipelines)
            {
                SafeDestroy(pipeline);
            }

            if (Stat == Status.Finished)
            {
                OnClose();
            }

            if (Stat == Status.Stopped)
            {
                Logger.Info("Spider " + Identity + " stop success!");
            }

            _runningExit = true;
        }