Exemple #1
0
        /// <summary>
        /// Performs the one-time initialization of the spider: resolves the monitor, injects cookies,
        /// initializes the scheduler and the pipelines, starts the background status-reporting task,
        /// prepares the error-request file and hands the site's start requests to the scheduler.
        /// Once initialization has completed, subsequent calls return immediately.
        /// </summary>
        /// <param name="arguments">Arguments forwarded unchanged to <c>PreInitComponent</c> and <c>AfterInitComponent</c>.</param>
        /// <exception cref="SpiderException">Thrown when <c>Pipelines</c> is null or contains no pipeline.</exception>
        protected virtual void InitComponent(params string[] arguments)
        {
            if (_init)
            {
                return;
            }

            this.Log("构建内部模块、准备爬虫数据...", LogLevel.Info);

            if (Pipelines == null || Pipelines.Count == 0)
            {
                // The guard rejects an empty collection as well as null, so the message states both.
                throw new SpiderException("Pipelines should not be null or empty.");
            }

            PreInitComponent(arguments);

            // Fall back to the NLog-based monitor when the container resolves nothing.
            _monitor = IocManager.Resolve<IMonitor>() ?? new NLogMonitor();

            if (CookieInjector != null)
            {
                CookieInjector.Inject(this, false);
            }

            Scheduler.Init(this);

            // The reporting loop blocks in Thread.Sleep until the spider has exited, so it is started
            // with the LongRunning hint, allowing the scheduler to run it outside the regular
            // thread-pool workers.
            _monitorTask = Task.Factory.StartNew(() =>
            {
                while (!Monitorable.IsExited)
                {
                    ReportStatus();
                    Thread.Sleep(2000);
                }

                // One more report after the loop ends so the state at exit is published too.
                ReportStatus();
            }, TaskCreationOptions.LongRunning);

#if !NET_CORE
            _errorRequestFile = BasePipeline.PrepareFile(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "ErrorRequests", Identity, "errors.txt"));
#else
            _errorRequestFile = BasePipeline.PrepareFile(Path.Combine(AppContext.BaseDirectory, "ErrorRequests", Identity, "errors.txt"));
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#endif
            Console.CancelKeyPress += ConsoleCancelKeyPress;

            foreach (var pipeline in Pipelines)
            {
                pipeline.InitPipeline(this);
            }

            if (Site.StartRequests != null && Site.StartRequests.Count > 0)
            {
                this.Log($"准备步骤: 添加链接到调度中心, 数量 {Site.StartRequests.Count}.", LogLevel.Info);

                if ((Scheduler is QueueDuplicateRemovedScheduler) || (Scheduler is PriorityScheduler))
                {
                    // These two scheduler types receive the start requests one by one, pushed by
                    // at most four parallel workers.
                    // NOTE(review): unlike the Import path below, this path does not call
                    // ClearStartRequests() - confirm that this is intentional.
                    Parallel.ForEach(Site.StartRequests, new ParallelOptions()
                    {
                        MaxDegreeOfParallelism = 4
                    }, request =>
                    {
                        Scheduler.Push(request);
                    });
                }
                else
                {
                    // Every other scheduler takes the start requests as a single set; the start
                    // requests are cleared afterwards.
                    Scheduler.Import(new HashSet<Request>(Site.StartRequests));
                    ClearStartRequests();
                }
            }
            else
            {
                this.Log("准备步骤: 添加链接到调度中心, 数量 0.", LogLevel.Info);
            }

            // NOTE(review): presumably the number of wait intervals tolerated while idle - confirm.
            _waitCountLimit = EmptySleepTime / WaitInterval;

            AfterInitComponent(arguments);

            _init = true;
        }
        /// <summary>
        /// Concrete implementation of the download step: navigates the shared WebDriver instance to the
        /// request URL, runs the configured WebDriver handlers, waits for the configured time and
        /// captures the resulting page source.
        /// </summary>
        /// <param name="request">Request information.</param>
        /// <param name="spider">The spider that issues the request.</param>
        /// <returns>
        /// A completed task holding the page data. When a <c>DownloadException</c> is caught, the page
        /// comes from <c>site.AddToCycleRetry</c> if <c>CycleRetryTimes</c> is greater than zero,
        /// otherwise it is a page carrying the exception. Any other exception is logged and returned
        /// on the page's <c>Exception</c> property rather than being thrown.
        /// </returns>
        protected override Task<Page> DowloadContent(Request request, ISpider spider)
        {
            Site site = spider.Site;

            try
            {
                lock (_locker)
                {
                    // The browser is opened lazily by the first request; cookie seeding and the
                    // login injection below run only as part of that creation.
                    if (_webDriver == null)
                    {
                        _webDriver = WebDriverUtil.Open(_browser, _option);

                        if (_domains != null)
                        {
                            // Copy the cookies held for every configured domain into the browser.
                            foreach (var domain in _domains)
                            {
                                var cookies = CookieContainer.GetCookies(new Uri(domain));
                                foreach (System.Net.Cookie cookie in cookies)
                                {
                                    AddCookieToDownloadClient(cookie);
                                }
                            }
                        }

                        if (!_isLogined && CookieInjector != null)
                        {
                            // A WebDriver-based login handler is given the driver before the
                            // injection is performed.
                            var webdriverLoginHandler = CookieInjector as WebDriverLoginHandler;
                            if (webdriverLoginHandler != null)
                            {
                                webdriverLoginHandler.Driver = _webDriver as RemoteWebDriver;
                            }

                            CookieInjector.Inject(this, spider);
                            _isLogined = true;
                        }
                    }
                }

                string realUrl = request.Url.ToString();

                NetworkCenter.Current.Execute("webdriver-download", () =>
                {
                    _webDriver.Navigate().GoToUrl(realUrl);

                    if (WebDriverHandlers != null)
                    {
                        foreach (var handler in WebDriverHandlers)
                        {
                            handler.Handle((RemoteWebDriver)_webDriver);
                        }
                    }
                });

                // Fixed wait before the page source is read.
                Thread.Sleep(_webDriverWaitTime);

                Page page = new Page(request)
                {
                    Content = _webDriver.PageSource,
                    TargetUrl = _webDriver.Url
                };

                return Task.FromResult(page);
            }
            catch (DownloadException de)
            {
                // With cycle retry enabled the request is re-queued and the page returned by the
                // retry call is used; otherwise the exception travels on the page itself.
                Page page = site.CycleRetryTimes > 0
                    ? site.AddToCycleRetry(request)
                    : new Page(request) { Exception = de };

                spider.Logger.Error($"下载 {request.Url} 失败: {de.Message}.");
                return Task.FromResult(page);
            }
            catch (Exception e)
            {
                spider.Logger.Error($"下载 {request.Url} 失败: {e.Message}.");
                Page page = new Page(request)
                {
                    Exception = e
                };
                return Task.FromResult(page);
            }
        }
        /// <summary>
        /// Downloads the page for <paramref name="request"/> by navigating the shared WebDriver
        /// instance to the request URL, raising <c>NavigateCompeleted</c>, waiting for the configured
        /// time and capturing the resulting page source.
        /// </summary>
        /// <param name="request">Request information.</param>
        /// <param name="spider">The spider that issues the request.</param>
        /// <returns>
        /// The page data. When a <c>DownloadException</c> is caught, the page comes from
        /// <c>Spider.AddToCycleRetry</c> if <c>CycleRetryTimes</c> is greater than zero, otherwise it
        /// is a page carrying the exception. Any other exception is logged and returned on the page's
        /// <c>Exception</c> property rather than being thrown.
        /// </returns>
        protected override Page DowloadContent(Request request, ISpider spider)
        {
            Site site = spider.Site;

            try
            {
                lock (_locker)
                {
                    // The browser is opened lazily by the first request.
                    _webDriver = _webDriver ?? WebDriverExtensions.Open(_browser, _option);

                    // Guard against a null domain list: enumerating it unguarded would throw a
                    // NullReferenceException that the generic handler below turns into a failed page
                    // for every single download.
                    if (_domains != null)
                    {
                        foreach (var domain in _domains)
                        {
                            var cookies = _cookieContainer.GetCookies(new Uri(domain));
                            foreach (System.Net.Cookie cookie in cookies)
                            {
                                AddCookieToDownloadClient(cookie);
                            }
                        }
                    }

                    if (!_isLogined && CookieInjector != null)
                    {
                        // A WebDriver-based login handler is given the driver before the injection
                        // is performed.
                        var webdriverLoginHandler = CookieInjector as WebDriverLoginHandler;
                        if (webdriverLoginHandler != null)
                        {
                            webdriverLoginHandler.Driver = _webDriver as RemoteWebDriver;
                        }

                        CookieInjector.Inject(this, spider);
                        _isLogined = true;
                    }
                }

                // TODO: re-implement cookie setup for WebDriverDownloader. The earlier, disabled draft
                // pointed the driver at the scheme://host[:port] root of the request when the driver
                // held no cookies and then added the cookie pairs configured on the site.

                string realUrl = request.Url.ToString();

                NetworkCenter.Current.Execute("webdriver-download", () =>
                {
                    _webDriver.Navigate().GoToUrl(realUrl);

                    NavigateCompeleted?.Invoke((RemoteWebDriver)_webDriver);
                });

                // Fixed wait before the page source is read.
                Thread.Sleep(_webDriverWaitTime);

                Page page = new Page(request)
                {
                    Content = _webDriver.PageSource,
                    TargetUrl = _webDriver.Url
                };

                // NOTE (original author): the cycle-tried-times extra has to be cleared when finished,
                // because persisting it to Redis makes a single task run in an endless loop. The
                // corresponding call is currently disabled:
                // request.PutExtra(Request.CycleTriedTimes, null);
                return page;
            }
            catch (DownloadException de)
            {
                // With cycle retry enabled the request is re-queued and the page returned by the
                // retry call is used; otherwise the exception travels on the page itself.
                Page page = site.CycleRetryTimes > 0
                    ? Spider.AddToCycleRetry(request, site)
                    : new Page(request) { Exception = de };

                Logger.Log(spider.Identity, $"下载 {request.Url} 失败: {de.Message}.", Level.Warn);
                return page;
            }
            catch (Exception e)
            {
                Logger.Log(spider.Identity, $"下载 {request.Url} 失败: {e.Message}.", Level.Warn);
                Page page = new Page(request)
                {
                    Exception = e
                };
                return page;
            }
        }