/// <summary>
/// Performs the one-time initialization of the spider's internal components:
/// validates the pipelines, resolves the monitor, initializes the scheduler,
/// starts the status-report task, prepares the error-request file, initializes
/// every pipeline and hands <c>Site.StartRequests</c> to the scheduler.
/// Returns immediately if initialization has already completed.
/// </summary>
/// <param name="arguments">Arguments forwarded unchanged to <c>PreInitComponent</c> and <c>AfterInitComponent</c>.</param>
/// <exception cref="SpiderException">Thrown when <c>Pipelines</c> is null or contains no pipeline.</exception>
protected virtual void InitComponent(params string[] arguments)
{
    if (_init)
    {
        return;
    }

    this.Log("构建内部模块、准备爬虫数据...", LogLevel.Info);

    // Raised for an empty pipeline list as well as for a null one.
    if (Pipelines == null || Pipelines.Count == 0)
    {
        throw new SpiderException("Pipelines should not be null.");
    }

    PreInitComponent(arguments);

    // Fall back to the NLog-based monitor when the container resolves nothing.
    _monitor = IocManager.Resolve<IMonitor>() ?? new NLogMonitor();

    if (CookieInjector != null)
    {
        CookieInjector.Inject(this, false);
    }

    Scheduler.Init(this);

    // Reports status every 2 seconds until the spider has exited, then once more.
    // The loop blocks in Thread.Sleep for the whole lifetime of the spider, so it is
    // flagged LongRunning instead of permanently occupying a thread-pool worker.
    _monitorTask = Task.Factory.StartNew(() =>
    {
        while (!Monitorable.IsExited)
        {
            ReportStatus();
            Thread.Sleep(2000);
        }

        ReportStatus();
    }, TaskCreationOptions.LongRunning);

#if !NET_CORE
    _errorRequestFile = BasePipeline.PrepareFile(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "ErrorRequests", Identity, "errors.txt"));
#else
    _errorRequestFile = BasePipeline.PrepareFile(Path.Combine(AppContext.BaseDirectory, "ErrorRequests", Identity, "errors.txt"));
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#endif

    Console.CancelKeyPress += ConsoleCancelKeyPress;

    foreach (var pipeline in Pipelines)
    {
        pipeline.InitPipeline(this);
    }

    if (Site.StartRequests != null && Site.StartRequests.Count > 0)
    {
        this.Log($"准备步骤: 添加链接到调度中心, 数量 {Site.StartRequests.Count}.", LogLevel.Info);

        if ((Scheduler is QueueDuplicateRemovedScheduler) || (Scheduler is PriorityScheduler))
        {
            // These schedulers receive the requests one by one, pushed by at most 4 workers at a time.
            Parallel.ForEach(Site.StartRequests, new ParallelOptions() { MaxDegreeOfParallelism = 4 }, request =>
            {
                Scheduler.Push(request);
            });
        }
        else
        {
            // Any other scheduler receives the requests in one call as a HashSet,
            // after which the start requests are cleared.
            Scheduler.Import(new HashSet<Request>(Site.StartRequests));
            ClearStartRequests();
        }
    }
    else
    {
        this.Log("准备步骤: 添加链接到调度中心, 数量 0.", LogLevel.Info);
    }

    _waitCountLimit = EmptySleepTime / WaitInterval;

    AfterInitComponent(arguments);

    _init = true;
}
/// <summary>
/// Concrete download implementation: navigates the WebDriver instance to the
/// request URL and captures the resulting page source.
/// </summary>
/// <param name="request">The request to download.</param>
/// <param name="spider">The spider issuing the request; supplies the site settings and the logger.</param>
/// <returns>
/// An already-completed task holding the page. On success the page carries the driver's
/// page source and current URL. On failure it carries the exception, or, for a
/// <c>DownloadException</c> with cycle retry enabled, it is the page returned by
/// <c>site.AddToCycleRetry</c>.
/// </returns>
protected override Task<Page> DowloadContent(Request request, ISpider spider)
{
    Site site = spider.Site;

    try
    {
        lock (_locker)
        {
            // The driver is created on first use; cookie seeding and login happen only at that point.
            if (_webDriver == null)
            {
                _webDriver = WebDriverUtil.Open(_browser, _option);

                if (_domains != null)
                {
                    foreach (var domain in _domains)
                    {
                        var domainCookies = CookieContainer.GetCookies(new Uri(domain));
                        foreach (System.Net.Cookie domainCookie in domainCookies)
                        {
                            AddCookieToDownloadClient(domainCookie);
                        }
                    }
                }

                if (!_isLogined && CookieInjector != null)
                {
                    // A WebDriver-based login handler is given the driver before injection runs.
                    var loginHandler = CookieInjector as WebDriverLoginHandler;
                    if (loginHandler != null)
                    {
                        loginHandler.Driver = _webDriver as RemoteWebDriver;
                    }

                    CookieInjector.Inject(this, spider);
                    _isLogined = true;
                }
            }
        }

        // NOTE(review): this value is not read anywhere below; the expression is kept
        // so the accesses to request.Uri still take place exactly as before.
        string domainUrl = request.Uri.Scheme + "://" + request.Uri.DnsSafeHost
            + (request.Uri.Port == 80 ? "" : ":" + request.Uri.Port);

        string targetUrl = request.Url.ToString();

        NetworkCenter.Current.Execute("webdriver-download", () =>
        {
            _webDriver.Navigate().GoToUrl(targetUrl);

            if (WebDriverHandlers != null)
            {
                foreach (var driverHandler in WebDriverHandlers)
                {
                    driverHandler.Handle((RemoteWebDriver)_webDriver);
                }
            }
        });

        // Wait for the configured time before reading the page source.
        Thread.Sleep(_webDriverWaitTime);

        Page downloaded = new Page(request);
        downloaded.Content = _webDriver.PageSource;
        downloaded.TargetUrl = _webDriver.Url;

        return Task.FromResult(downloaded);
    }
    catch (DownloadException downloadError)
    {
        Page failed = new Page(request);
        failed.Exception = downloadError;

        // With cycle retry enabled, the page returned by the retry call replaces the failed page.
        if (site.CycleRetryTimes > 0)
        {
            failed = site.AddToCycleRetry(request);
        }

        spider.Logger.Error($"下载 {request.Url} 失败: {downloadError.Message}.");
        return Task.FromResult(failed);
    }
    catch (Exception error)
    {
        spider.Logger.Error($"下载 {request.Url} 失败: {error.Message}.");

        Page failed = new Page(request);
        failed.Exception = error;

        return Task.FromResult(failed);
    }
}
/// <summary>
/// Downloads a page by navigating the WebDriver instance to the request URL
/// and capturing the resulting page source.
/// </summary>
/// <param name="request">The request to download.</param>
/// <param name="spider">The spider issuing the request; supplies the site settings and the identity used for logging.</param>
/// <returns>
/// The downloaded page. On failure the page carries the exception, or, for a
/// <c>DownloadException</c> with cycle retry enabled, it is the page returned by
/// <c>Spider.AddToCycleRetry</c>.
/// </returns>
protected override Page DowloadContent(Request request, ISpider spider)
{
    Site site = spider.Site;
    try
    {
        lock (_locker)
        {
            // Create the driver on first use.
            _webDriver = _webDriver ?? WebDriverExtensions.Open(_browser, _option);

            // Cookies are pushed to the driver on every call. Guard against a null
            // domain list so an unconfigured downloader does not fail every download
            // with a NullReferenceException.
            if (_domains != null)
            {
                foreach (var domain in _domains)
                {
                    var cookies = _cookieContainer.GetCookies(new Uri(domain));
                    foreach (System.Net.Cookie cookie in cookies)
                    {
                        AddCookieToDownloadClient(cookie);
                    }
                }
            }

            if (!_isLogined && CookieInjector != null)
            {
                // A WebDriver-based login handler is given the driver before injection runs.
                var webdriverLoginHandler = CookieInjector as WebDriverLoginHandler;
                if (webdriverLoginHandler != null)
                {
                    webdriverLoginHandler.Driver = _webDriver as RemoteWebDriver;
                }

                CookieInjector.Inject(this, spider);
                _isLogined = true;
            }
        }

        // TODO: re-implement cookie setup for the WebDriver downloader. The disabled
        // draft navigated to domainUrl, deleted all driver cookies and then added the
        // site's cookie pairs. Until that is restored, domainUrl is not read.
        var domainUrl = $"{request.Uri.Scheme}://{request.Uri.DnsSafeHost}{(request.Uri.Port == 80 ? "" : ":" + request.Uri.Port)}";

        string realUrl = request.Url.ToString();

        NetworkCenter.Current.Execute("webdriver-download", () =>
        {
            _webDriver.Navigate().GoToUrl(realUrl);
            NavigateCompeleted?.Invoke((RemoteWebDriver)_webDriver);
        });

        // Wait for the configured time before reading the page source.
        Thread.Sleep(_webDriverWaitTime);

        Page page = new Page(request)
        {
            Content = _webDriver.PageSource,
            TargetUrl = _webDriver.Url
        };

        // NOTE: an earlier draft reset Request.CycleTriedTimes here, with the remark that
        // persisting the value to Redis makes a single task run in an endless loop.
        // That reset is currently disabled.
        return page;
    }
    catch (DownloadException de)
    {
        Page page = new Page(request) { Exception = de };

        // With cycle retry enabled, the page returned by the retry call replaces the failed page.
        if (site.CycleRetryTimes > 0)
        {
            page = Spider.AddToCycleRetry(request, site);
        }

        Logger.Log(spider.Identity, $"下载 {request.Url} 失败: {de.Message}.", Level.Warn);
        return page;
    }
    catch (Exception e)
    {
        Logger.Log(spider.Identity, $"下载 {request.Url} 失败: {e.Message}.", Level.Warn);
        Page page = new Page(request) { Exception = e };
        return page;
    }
}