/// <summary>
/// Builds the page handed back to the caller after the download of <paramref name="request"/> failed,
/// and writes a warning to the log.
/// </summary>
/// <param name="e">The exception raised by the failed download; stored on the returned page.</param>
/// <param name="request">The request whose download failed.</param>
/// <param name="spider">The spider owning the request; supplies the site settings and the log identity.</param>
/// <returns>
/// The result of <c>Spider.AddToCycleRetry</c> when the site has cycle retry enabled (that result is
/// null-checked here, so it may be null); otherwise a fresh page for the request.
/// </returns>
private Page CreateRetryPage(Exception e, Request request, ISpider spider)
{
    Page retryPage;
    if (spider.Site.CycleRetryTimes > 0)
    {
        retryPage = Spider.AddToCycleRetry(request, spider.Site);
    }
    else
    {
        retryPage = new Page(request);
    }

    if (retryPage != null)
    {
        retryPage.Exception = e;
    }

    Logger.Log(spider.Identity, $"Download {request.Url} failed: {e.Message}.", Level.Warn);
    return retryPage;
}
/// <summary>
/// When the downloaded content contains <c>Content</c>, asks the network executor to redial and
/// replaces <paramref name="page"/> with the cycle-retry page for the same request.
/// </summary>
/// <param name="page">
/// The downloaded page. On a match it is replaced by the result of <c>Spider.AddToCycleRetry</c>,
/// which may be null (other call sites in this code base null-check it).
/// </param>
/// <param name="spider">The spider owning the page; exited when the redial fails.</param>
public override void Handle(ref Page page, ISpider spider)
{
    if (page == null
        || string.IsNullOrEmpty(page.Content)
        || string.IsNullOrEmpty(Content)
        || !page.Content.Contains(Content))
    {
        return;
    }

    if (NetworkCenter.Current.Executor.Redial() == RedialResult.Failed)
    {
        Logger.MyLog(spider.Identity, "Exit program because redial failed.", LogLevel.Error);
        spider.Exit();
    }

    page = Spider.AddToCycleRetry(page.Request, spider.Site);

    // The retry page can be null; only attach the reason when there is a page to carry it.
    if (page != null)
    {
        page.Exception = new DownloadException($"Content downloaded contains string: {Content}.");
    }
}
/// <summary>
/// When the downloaded content contains <c>Content</c> and a cookie injector is configured, asks the
/// network executor to redial, registers the request for cycle retry, re-injects cookies and marks the
/// page with a <c>DownloadException</c>.
/// </summary>
/// <param name="page">The downloaded page; left untouched unless its content matches.</param>
/// <param name="spider">The spider owning the page; exited when the redial fails.</param>
public override void Handle(ref Page page, ISpider spider)
{
    if (page == null || string.IsNullOrEmpty(page.Content))
    {
        return;
    }

    if (string.IsNullOrEmpty(Content) || CookieInjector == null)
    {
        return;
    }

    if (!page.Content.Contains(Content))
    {
        return;
    }

    var redialOutcome = NetworkCenter.Current.Executor.Redial();
    if (redialOutcome == RedialResult.Failed)
    {
        spider.Exit();
    }

    // The page returned by AddToCycleRetry is not used here; the current page keeps flowing on.
    Spider.AddToCycleRetry(page.Request, spider.Site);
    CookieInjector?.Inject(spider);
    page.Exception = new DownloadException($"Content downloaded contains string: {Content}.");
}
/// <summary>
/// Triggers an ADSL redial when the message of the page's exception contains the configured text.
/// </summary>
/// <param name="page">The page data; ignored when it is null, has blank content or carries no exception.</param>
/// <param name="spider">The spider; exited when the redial fails.</param>
public override void Handle(ref Page page, ISpider spider)
{
    // IsNullOrWhiteSpace also covers the null and empty cases.
    bool hasContent = page != null && !string.IsNullOrWhiteSpace(page.Content);
    if (!hasContent || page.Exception == null)
    {
        return;
    }

    if (!page.Exception.Message.Contains(_exceptionMessage))
    {
        return;
    }

    var redialOutcome = NetworkCenter.Current.Executor.Redial();
    if (redialOutcome == RedialResult.Failed)
    {
        Logger.AllLog(spider.Identity, "Exit program because redial failed.", LogLevel.Error);
        spider.Exit();
    }

    // The page returned by AddToCycleRetry is not used here; the current page keeps flowing on.
    Spider.AddToCycleRetry(page.Request, spider.Site);
    page.Exception = new DownloadException("Download failed and redial finished already.");
}
/// <summary>
/// Triggers an ADSL redial and re-injects cookies when the page content contains one of the
/// configured strings.
/// </summary>
/// <param name="page">The page data; ignored when it is null or has no content.</param>
/// <param name="spider">The spider; exited when the redial fails.</param>
public override void Handle(ref Page page, ISpider spider)
{
    var pageContent = page?.Content;
    if (string.IsNullOrEmpty(pageContent))
    {
        return;
    }

    // First configured string that occurs in the content, or null when none does.
    var matched = _contents.FirstOrDefault(candidate => pageContent.Contains(candidate));
    if (matched == null)
    {
        return;
    }

    var redialOutcome = NetworkCenter.Current.Executor.Redial();
    if (redialOutcome == RedialResult.Failed)
    {
        spider.Exit();
    }

    // The page returned by AddToCycleRetry is not used here; the current page keeps flowing on.
    Spider.AddToCycleRetry(page.Request, spider.Site);
    _cookieInjector.Inject(spider);
    page.Exception = new DownloadException($"Downloaded content contains: {matched}.");
}
/// <summary>
/// Triggers an ADSL redial when the page content contains one of the configured strings, and
/// replaces <paramref name="page"/> with the cycle-retry page for the same request.
/// </summary>
/// <param name="page">
/// The page data. On a match it is replaced by the result of <c>Spider.AddToCycleRetry</c>,
/// which may be null (other call sites in this code base null-check it).
/// </param>
/// <param name="spider">The spider; exited when the redial fails.</param>
public override void Handle(ref Page page, ISpider spider)
{
    if (string.IsNullOrEmpty(page?.Content))
    {
        return;
    }

    var content = page.Content;
    var containContent = _contents.FirstOrDefault(c => content.Contains(c));
    if (containContent == null)
    {
        return;
    }

    if (NetworkCenter.Current.Executor.Redial() == RedialResult.Failed)
    {
        Logger.AllLog(spider.Identity, "Exit program because redial failed.", LogLevel.Error);
        spider.Exit();
    }

    page = Spider.AddToCycleRetry(page.Request, spider.Site);

    // The retry page can be null; only attach the reason when there is a page to carry it.
    if (page != null)
    {
        page.Exception = new DownloadException($"Downloaded content contains: {containContent}.");
    }
}
/// <summary>
/// Counts handled pages and, every <c>RedialLimit</c> pages, registers the current request for cycle
/// retry and asks the network executor to redial. Does nothing when <c>RedialLimit</c> is 0.
/// </summary>
/// <param name="page">The page just downloaded; may be null, in which case nothing is requeued.</param>
/// <param name="spider">The spider; exited when the redial fails.</param>
public override void Handle(ref Page page, ISpider spider)
{
    if (RedialLimit == 0)
    {
        return;
    }

    lock (_locker)
    {
        ++RequestedCount;

        // ">=" rather than "==": if the limit is lowered below the current count, an exact match
        // would never be reached again.
        if (RedialLimit > 0 && RequestedCount >= RedialLimit)
        {
            RequestedCount = 0;

            // A null page must not prevent the redial below.
            if (page != null)
            {
                Spider.AddToCycleRetry(page.Request, spider.Site);
            }

            if (NetworkCenter.Current.Executor.Redial() == RedialResult.Failed)
            {
                spider.Exit();
            }
        }
    }
}
/// <summary>
/// When the message of the page's exception contains <c>ExceptionMessage</c>, asks the network
/// executor to redial, registers the request for cycle retry and replaces the page's exception.
/// </summary>
/// <param name="page">The downloaded page; ignored when it is null, has no content or carries no exception.</param>
/// <param name="spider">The spider owning the page; exited when the redial fails.</param>
public override void Handle(ref Page page, ISpider spider)
{
    // An empty ExceptionMessage disables the handler. The original also had an inner
    // "ExceptionMessage is empty" branch, which this guard made unreachable; it is removed.
    if (page == null
        || string.IsNullOrEmpty(page.Content)
        || string.IsNullOrEmpty(ExceptionMessage)
        || page.Exception == null)
    {
        return;
    }

    if (!page.Exception.Message.Contains(ExceptionMessage))
    {
        return;
    }

    if (NetworkCenter.Current.Executor.Redial() == RedialResult.Failed)
    {
        Logger.MyLog(spider.Identity, "Exit program because redial failed.", LogLevel.Error);
        spider.Exit();
    }

    // The page returned by AddToCycleRetry is not used here; the current page keeps flowing on.
    Spider.AddToCycleRetry(page.Request, spider.Site);
    page.Exception = new DownloadException("Download failed and redial finished already.");
}
/// <summary>
/// Downloads <paramref name="request"/> with a pooled <c>HttpClient</c> and turns the response into a page.
/// </summary>
/// <param name="request">The request to download; its status code and proxy extra are updated.</param>
/// <param name="spider">The spider owning the request; supplies the site settings and the log identity.</param>
/// <returns>
/// The downloaded page; a skipped page when the media type is not textual and file download is off,
/// or when an unexpected exception occurs; or, on a download/HTTP failure, the cycle-retry page
/// (which may be null) or an empty page carrying the exception.
/// </returns>
protected override Page DowloadContent(Request request, ISpider spider)
{
    Site site = spider.Site;
    HttpResponseMessage response = null;
    var proxy = site.GetHttpProxy();
    request.PutExtra(Request.Proxy, proxy);
    try
    {
        var httpMessage = GenerateHttpRequestMessage(request, site);

        response = NetworkCenter.Current.Execute("http", message =>
        {
            HttpClient httpClient = _httpClientPool.GetHttpClient(proxy);
            var requestTask = httpClient.SendAsync(message);

            // Wait at most site.Timeout; anything that has not completed by then is reported
            // as a request timeout.
            requestTask.Wait(site.Timeout);
            if (requestTask.Status == TaskStatus.RanToCompletion)
            {
                return requestTask.Result;
            }
            else
            {
                return new HttpResponseMessage(HttpStatusCode.RequestTimeout);
            }
        }, httpMessage);

        request.StatusCode = response.StatusCode;
        response.EnsureSuccessStatusCode();
        if (!site.AcceptStatCode.Contains(response.StatusCode))
        {
            throw new DownloadException($"下载 {request.Url} 失败. Code {response.StatusCode}");
        }

        Page page;
        if (response.Content.Headers.ContentType != null
            && !MediaTypes.Contains(response.Content.Headers.ContentType.MediaType))
        {
            if (!site.DownloadFiles)
            {
                Logger.MyLog(spider.Identity, $"Miss request: {request.Url} because media type is not text.", LogLevel.Error);
                return new Page(request, site.ContentType, null) { IsSkip = true };
            }
            else
            {
                page = SaveFile(request, response, spider);
            }
        }
        else
        {
            page = HandleResponse(request, response, site);
        }

        if (string.IsNullOrEmpty(page.Content))
        {
            Logger.MyLog(spider.Identity, $"下载 {request.Url} 内容为空.", LogLevel.Warn);
        }

        // need update
        page.TargetUrl = request.Url.ToString();

        // Clear the retry counter once the download succeeded; per the original author, persisting
        // this value to Redis makes a single task loop forever.
        request.PutExtra(Request.CycleTriedTimes, null);

        return page;
    }
    catch (DownloadException de)
    {
        Page page = site.CycleRetryTimes > 0
            ? Spider.AddToCycleRetry(request, site)
            : new Page(request, site.ContentType, null);

        // AddToCycleRetry may return null; dereferencing it here would throw out of the catch block.
        if (page != null)
        {
            page.Exception = de;
        }

        Logger.MyLog(spider.Identity, $"下载 {request.Url} 失败: {de.Message}", LogLevel.Warn);
        return page;
    }
    catch (HttpRequestException he)
    {
        Page page = site.CycleRetryTimes > 0
            ? Spider.AddToCycleRetry(request, site)
            : new Page(request, site.ContentType, null);

        if (page != null)
        {
            page.Exception = he;
        }

        Logger.MyLog(spider.Identity, $"下载 {request.Url} 失败: {he.Message}.", LogLevel.Warn);
        return page;
    }
    catch (Exception e)
    {
        Page page = new Page(request, site.ContentType, null);
        page.Exception = e;
        page.IsSkip = true;

        Logger.MyLog(spider.Identity, $"下载 {request.Url} 失败: {e.Message}.", LogLevel.Error, e);
        return page;
    }
    finally
    {
        // Dispose the response here so it is released even when an earlier statement threw.
        try
        {
            response?.Dispose();
        }
        catch (Exception e)
        {
            Logger.MyLog(spider.Identity, "Close response fail.", LogLevel.Error, e);
        }
    }
}
/// <summary>
/// Concrete implementation of the download: loads the request URL in the shared web driver and
/// returns the rendered page source.
/// </summary>
/// <param name="request">The request information.</param>
/// <param name="spider">The spider.</param>
/// <returns>
/// The page data. On a <c>DownloadException</c> with cycle retry enabled this is the result of
/// <c>Spider.AddToCycleRetry</c>, which may be null; on any other failure it is an empty page
/// carrying the exception.
/// </returns>
protected override Page DowloadContent(Request request, ISpider spider)
{
    Site site = spider.Site;
    try
    {
        lock (_locker)
        {
            // First use: open the browser, seed it with the cookies held for the configured
            // domains, then run the cookie injector once.
            if (_webDriver == null)
            {
                _webDriver = WebDriverUtil.Open(_browser, _option);

                if (_domains != null)
                {
                    foreach (var domain in _domains)
                    {
                        var cookies = CookieContainer.GetCookies(new Uri(domain));
                        foreach (System.Net.Cookie cookie in cookies)
                        {
                            AddCookieToDownloadClient(cookie);
                        }
                    }
                }

                if (!_isLogined && CookieInjector != null)
                {
                    var webdriverLoginHandler = CookieInjector as WebDriverLoginHandler;
                    if (webdriverLoginHandler != null)
                    {
                        webdriverLoginHandler.Driver = _webDriver as RemoteWebDriver;
                    }

                    CookieInjector.Inject(this, spider);
                    _isLogined = true;
                }
            }
        }

        string realUrl = request.Url.ToString();

        NetworkCenter.Current.Execute("webdriver-download", () =>
        {
            _webDriver.Navigate().GoToUrl(realUrl);

            if (WebDriverHandlers != null)
            {
                foreach (var handler in WebDriverHandlers)
                {
                    handler.Handle((RemoteWebDriver)_webDriver);
                }
            }
        });

        // Fixed wait before the page source is read.
        Thread.Sleep(_webDriverWaitTime);

        Page page = new Page(request)
        {
            Content = _webDriver.PageSource,
            TargetUrl = _webDriver.Url
        };

        return page;
    }
    catch (DownloadException de)
    {
        Page page = site.CycleRetryTimes > 0
            ? Spider.AddToCycleRetry(request, site)
            : new Page(request);

        // Keep the failure on the retry page too (it used to be dropped when the retry page
        // replaced the original one). The retry page may be null.
        if (page != null)
        {
            page.Exception = de;
        }

        Logger.Log(spider.Identity, $"下载 {request.Url} 失败: {de.Message}.", Level.Warn);
        return page;
    }
    catch (Exception e)
    {
        Logger.Log(spider.Identity, $"下载 {request.Url} 失败: {e.Message}.", Level.Warn);
        Page page = new Page(request) { Exception = e };
        return page;
    }
}
/// <summary>
/// Loads the request URL in the shared web driver (logging in first when a login handler is
/// configured) and returns the rendered page source.
/// </summary>
/// <param name="request">The request to download; its cycle-retry counter is cleared on success.</param>
/// <param name="spider">The spider owning the request; supplies the site settings and the log identity.</param>
/// <returns>
/// The downloaded page. On a <c>DownloadException</c> with cycle retry enabled this is the result of
/// <c>Spider.AddToCycleRetry</c>, which may be null; on any other failure it is an empty page
/// carrying the exception.
/// </returns>
protected override Page DowloadContent(Request request, ISpider spider)
{
    Site site = spider.Site;
    try
    {
        // NOTE(review): locking on "this" lets outside code contend on the same monitor; a private
        // lock object would be safer, but the field has to be declared at class level.
        lock (this)
        {
            _webDriver = _webDriver ?? WebDriverExtensions.Open(_browser, _option);

            if (!_isLogined && Login != null)
            {
                _isLogined = Login.Handle(_webDriver as RemoteWebDriver);
                if (!_isLogined)
                {
                    throw new DownloadException("Login failed. Please check your login codes.");
                }
            }
        }

        Uri uri = request.Url;
        string portPart = uri.Port == 80 ? "" : ":" + uri.Port;
        var domainUrl = $"{uri.Scheme}://{uri.DnsSafeHost}{portPart}";

        // When the browser holds no cookies but the site defines some, open the domain root first
        // and then add the site's cookies. Site.Cookies may be null, so it is checked first.
        var options = _webDriver.Manage();
        var siteCookies = spider.Site.Cookies;
        if (options.Cookies.AllCookies.Count == 0 && siteCookies != null && siteCookies.PairPart.Count > 0)
        {
            _webDriver.Url = domainUrl;
            options.Cookies.DeleteAllCookies();
            foreach (var c in siteCookies.PairPart)
            {
                options.Cookies.AddCookie(new OpenQA.Selenium.Cookie(c.Key, c.Value));
            }
        }

        string realUrl = request.Url.ToString();

        NetworkCenter.Current.Execute("wdd", () =>
        {
            _webDriver.Navigate().GoToUrl(realUrl);
            NavigateCompeleted?.Invoke((RemoteWebDriver)_webDriver);
        });

        // Fixed wait before the page source is read.
        Thread.Sleep(_webDriverWaitTime);

        Page page = new Page(request, site.RemoveOutboundLinks ? site.Domains : null)
        {
            Content = _webDriver.PageSource,
            TargetUrl = _webDriver.Url,
            Title = _webDriver.Title
        };

        // Clear the retry counter once finished; per the original author, persisting this value to
        // Redis makes a single task loop forever.
        request.PutExtra(Request.CycleTriedTimes, null);
        return page;
    }
    catch (DownloadException de)
    {
        Page page = site.CycleRetryTimes > 0
            ? Spider.AddToCycleRetry(request, site)
            : new Page(request, null);

        // Keep the failure on the retry page too (it used to be dropped when the retry page
        // replaced the original one). The retry page may be null.
        if (page != null)
        {
            page.Exception = de;
        }

        Logger.MyLog(spider.Identity, $"下载 {request.Url} 失败: {de.Message}.", NLog.LogLevel.Warn);
        return page;
    }
    catch (Exception e)
    {
        Logger.MyLog(spider.Identity, $"下载 {request.Url} 失败: {e.Message}.", NLog.LogLevel.Warn);
        Page page = new Page(request, null) { Exception = e };
        return page;
    }
}
/// <summary>
/// Downloads <paramref name="request"/> with an <c>HttpWebRequest</c> and turns the response into a page.
/// </summary>
/// <param name="request">The request to download; its status code is updated from the response.</param>
/// <param name="spider">The spider owning the request; supplies the site settings and the log identity.</param>
/// <returns>
/// The downloaded page; a skipped page when the media type is not textual and file download is off,
/// or when an unexpected exception occurs; or, on a download/web failure, the cycle-retry page
/// (which may be null) or an empty page carrying the exception.
/// </returns>
protected override Page DowloadContent(Request request, ISpider spider)
{
    Site site = spider.Site;
    HttpWebResponse webResponse = null;
    var httpProxy = site.GetHttpProxy();

    try
    {
        var webRequest = GenerateHttpWebRequest(request, site);
        if (httpProxy != null)
        {
            webRequest.Proxy = httpProxy;
        }

        webResponse = NetworkCenter.Current.Execute("http", () => (HttpWebResponse)webRequest.GetResponse());
        request.StatusCode = webResponse.StatusCode;

        bool statusAccepted = site.AcceptStatCode.Contains(webResponse.StatusCode);
        if (!statusAccepted)
        {
            throw new DownloadException($"Download {request.Url} failed. Code {webResponse.StatusCode}");
        }

        // Only the part of the Content-Type before the first ';' is compared.
        var mediaType = webResponse.ContentType.Split(';').FirstOrDefault();
        bool treatAsText = string.IsNullOrEmpty(mediaType) || MediaTypes.Contains(mediaType);

        Page result;
        if (treatAsText)
        {
            result = ConstructPage(request, webResponse, site);
        }
        else if (site.DownloadFiles)
        {
            result = SaveFile(request, webResponse, spider);
        }
        else
        {
            Logger.MyLog(spider.Identity, $"Miss request: {request.Url} because media type is not text.", LogLevel.Error);
            return new Page(request, null) { Skip = true };
        }

        if (string.IsNullOrEmpty(result.Content))
        {
            Logger.MyLog(spider.Identity, $"Content is empty: {request.Url}.", LogLevel.Warn);
        }

        result.TargetUrl = webResponse.ResponseUri.ToString();
        return result;
    }
    catch (DownloadException de)
    {
        Page failed;
        if (site.CycleRetryTimes > 0)
        {
            failed = Spider.AddToCycleRetry(request, site);
        }
        else
        {
            failed = new Page(request, null);
        }

        // The retry page may be null.
        if (failed != null)
        {
            failed.Exception = de;
        }

        Logger.MyLog(spider.Identity, $"Download {request.Url} failed: {de.Message}", LogLevel.Warn);
        return failed;
    }
    catch (WebException he)
    {
        Page failed;
        if (site.CycleRetryTimes > 0)
        {
            failed = Spider.AddToCycleRetry(request, site);
        }
        else
        {
            failed = new Page(request, null);
        }

        if (failed != null)
        {
            failed.Exception = he;
        }

        Logger.MyLog(spider.Identity, $"Download {request.Url} failed: {he.Message}.", LogLevel.Warn);
        return failed;
    }
    catch (Exception e)
    {
        Page skipped = new Page(request, null) { Exception = e, Skip = true };
        Logger.MyLog(spider.Identity, $"Download {request.Url} failed: {e.Message}.", LogLevel.Error, e);
        return skipped;
    }
    finally
    {
        // Dispose the response here so it is released even when an earlier statement threw.
        try
        {
            webResponse?.Dispose();
        }
        catch (Exception e)
        {
            Logger.MyLog(spider.Identity, "Close response fail.", LogLevel.Error, e);
        }
    }
}
/// <summary>
/// Loads the request URL in the shared web driver (logging in first when a login handler is
/// configured), optionally submits a form, switches to a popup window and/or reads an iframe,
/// and returns the rendered page.
/// </summary>
/// <param name="request">The request to download.</param>
/// <param name="spider">The spider owning the request; supplies the site settings and the log identity.</param>
/// <returns>
/// The downloaded page: <c>Content</c> is the page source captured before any iframe switch and
/// <c>IframeContent</c> is the iframe source when the iframe option asks for it. On a
/// <c>DownloadException</c> with cycle retry enabled this is the result of
/// <c>Spider.AddToCycleRetry</c>, which may be null; on any other failure it is an empty page
/// carrying the exception.
/// </returns>
protected override Page DowloadContent(Request request, ISpider spider)
{
    Site site = spider.Site;
    try
    {
        lock (_locker)
        {
            _webDriver = _webDriver ?? WebDriverExtensions.Open(_browser, _option);

            if (!_isLogined && Login != null)
            {
                _isLogined = Login.Handle(_webDriver as RemoteWebDriver);
                if (!_isLogined)
                {
                    throw new DownloadException("Login failed. Please check your login codes.");
                }
            }
        }

        string realUrl = request.Url.ToString();

        NetworkCenter.Current.Execute("webdriver-download", () =>
        {
            _webDriver.Navigate().GoToUrl(realUrl);
            NavigateCompeleted?.Invoke((RemoteWebDriver)_webDriver);
        });

        // Fixed wait before the page is inspected.
        Thread.Sleep(_webDriverWaitTime);

        // [WDY] Fill in and submit the form when a handler is configured.
        if (FormSubmit != null)
        {
            FormSubmit.Handle(_webDriver as RemoteWebDriver);
        }

        // [WDY] Popup page: switch to the first window that is not the current one.
        if (_returnPopupPage)
        {
            var currentWindowHandle = _webDriver.CurrentWindowHandle;
            var popupWindowHandle = string.Empty;
            foreach (var handle in _webDriver.WindowHandles)
            {
                if (handle != currentWindowHandle)
                {
                    popupWindowHandle = handle;
                    break;
                }
            }

            if (!string.IsNullOrEmpty(popupWindowHandle))
            {
                _webDriver.SwitchTo().Window(popupWindowHandle);
            }
        }

        // [WDY] Page content. Do not move this line: it must be read BEFORE switching into the
        // iframe, because after the switch PageSource returns the iframe's HTML.
        string content = _webDriver.PageSource;

        // [WDY] Iframe: when ReturnIframe is true, also return the iframe's HTML. The iframe is
        // located by URL fragment when one is configured, otherwise by name.
        string iframeContent = null;
        if (_option.IframeOption != null && _option.IframeOption.ReturnIframe)
        {
            if (!string.IsNullOrEmpty(_option.IframeOption.IframeUrl) || !string.IsNullOrEmpty(_option.IframeOption.IframeName))
            {
                var iframeElement = !string.IsNullOrEmpty(_option.IframeOption.IframeUrl)
                    ? _webDriver.FindElement(By.XPath($"//iframe[contains(@src,'{_option.IframeOption.IframeUrl}')]"))
                    : _webDriver.FindElement(By.XPath($"//iframe[contains(@name,'{_option.IframeOption.IframeName}')]"));

                _webDriver.SwitchTo().Frame(iframeElement);

                // After switching into the iframe, PageSource returns the iframe's HTML.
                iframeContent = _webDriver.PageSource;
            }
        }

        Page page = new Page(request, site.RemoveOutboundLinks ? site.Domains : null)
        {
            // Use the source captured before the iframe switch; re-reading PageSource here made
            // Content identical to IframeContent whenever an iframe was selected.
            Content = content,
            IframeContent = iframeContent, // [WDY]
            TargetUrl = _webDriver.Url,
            Title = _webDriver.Title
        };

        return page;
    }
    catch (DownloadException de)
    {
        Page page = site.CycleRetryTimes > 0
            ? Spider.AddToCycleRetry(request, site)
            : new Page(request, null);

        // Keep the failure on the retry page too (it used to be dropped when the retry page
        // replaced the original one). The retry page may be null.
        if (page != null)
        {
            page.Exception = de;
        }

        Logger.AllLog(spider.Identity, $"下载 {request.Url} 失败: {de.Message}.", NLog.LogLevel.Warn);
        return page;
    }
    catch (Exception e)
    {
        Logger.AllLog(spider.Identity, $"下载 {request.Url} 失败: {e.Message}.", NLog.LogLevel.Warn);
        Page page = new Page(request, null) { Exception = e };
        return page;
    }
}
/// <summary>
/// Loads the request URL in the shared web driver, after copying the cookies held for the
/// configured domains into it and running the cookie injector once, and returns the rendered
/// page source.
/// </summary>
/// <param name="request">The request to download.</param>
/// <param name="spider">The spider owning the request; supplies the site settings and the log identity.</param>
/// <returns>
/// The downloaded page. On a <c>DownloadException</c> with cycle retry enabled this is the result of
/// <c>Spider.AddToCycleRetry</c>, which may be null; on any other failure it is an empty page
/// carrying the exception.
/// </returns>
protected override Page DowloadContent(Request request, ISpider spider)
{
    Site site = spider.Site;
    try
    {
        lock (_locker)
        {
            _webDriver = _webDriver ?? WebDriverExtensions.Open(_browser, _option);

            // Copy the cookies held for each configured domain into the browser. With no
            // domains configured there is nothing to copy.
            if (_domains != null)
            {
                foreach (var domain in _domains)
                {
                    var cookies = _cookieContainer.GetCookies(new Uri(domain));
                    foreach (System.Net.Cookie cookie in cookies)
                    {
                        AddCookieToDownloadClient(cookie);
                    }
                }
            }

            if (!_isLogined && CookieInjector != null)
            {
                var webdriverLoginHandler = CookieInjector as WebDriverLoginHandler;
                if (webdriverLoginHandler != null)
                {
                    webdriverLoginHandler.Driver = _webDriver as RemoteWebDriver;
                }

                CookieInjector.Inject(this, spider);
                _isLogined = true;
            }
        }

        // TODO: re-implement cookie setup for the WebDriver downloader.

        string realUrl = request.Url.ToString();

        NetworkCenter.Current.Execute("webdriver-download", () =>
        {
            _webDriver.Navigate().GoToUrl(realUrl);
            NavigateCompeleted?.Invoke((RemoteWebDriver)_webDriver);
        });

        // Fixed wait before the page source is read.
        Thread.Sleep(_webDriverWaitTime);

        Page page = new Page(request)
        {
            Content = _webDriver.PageSource,
            TargetUrl = _webDriver.Url
        };

        return page;
    }
    catch (DownloadException de)
    {
        Page page = site.CycleRetryTimes > 0
            ? Spider.AddToCycleRetry(request, site)
            : new Page(request);

        // Keep the failure on the retry page too (it used to be dropped when the retry page
        // replaced the original one). The retry page may be null.
        if (page != null)
        {
            page.Exception = de;
        }

        Logger.Log(spider.Identity, $"下载 {request.Url} 失败: {de.Message}.", Level.Warn);
        return page;
    }
    catch (Exception e)
    {
        Logger.Log(spider.Identity, $"下载 {request.Url} 失败: {e.Message}.", Level.Warn);
        Page page = new Page(request) { Exception = e };
        return page;
    }
}
/// <summary>
/// Downloads <paramref name="request"/> with an <c>HttpClient</c> (the shared one, or a pooled one
/// when the site has a proxy pool) and turns the response into a page.
/// </summary>
/// <param name="request">The request to download; its proxy and status code are updated.</param>
/// <param name="spider">The spider owning the request; supplies the site settings and the log identity.</param>
/// <returns>
/// The downloaded page; a skipped page when the media type is not in <c>ExcludeMediaTypes</c> and
/// file download is off, or when an unexpected exception occurs; or, on a download/HTTP failure,
/// the cycle-retry page (which may be null) or an empty page carrying the exception.
/// </returns>
protected override Page DowloadContent(Request request, ISpider spider)
{
    Site site = spider.Site;
    var proxy = site.GetHttpProxy();
    request.Proxy = proxy;
    HttpResponseMessage response = null;
    try
    {
        var httpMessage = GenerateHttpRequestMessage(request, site);
        HttpClient httpClient = null == spider.Site.HttpProxyPool ? HttpClient : _httpClientPool.GetHttpClient(proxy);

        // GetAwaiter().GetResult() rethrows the original exception. ".Result" wrapped transport
        // failures in AggregateException, which skipped the HttpRequestException retry handler
        // below and marked the request as skipped instead.
        response = NetworkCenter.Current.Execute("http", () => httpClient.SendAsync(httpMessage).GetAwaiter().GetResult());
        request.StatusCode = response.StatusCode;
        response.EnsureSuccessStatusCode();

        Page page;
        if (response.Content.Headers.ContentType != null
            && !ExcludeMediaTypes.Contains(response.Content.Headers.ContentType.MediaType))
        {
            if (!site.DownloadFiles)
            {
                Logger.AllLog(spider.Identity, $"Ignore: {request.Url} because media type is not allowed to download.", LogLevel.Warn);
                return new Page(request) { Skip = true };
            }
            else
            {
                page = SaveFile(request, response, spider);
            }
        }
        else
        {
            page = HandleResponse(request, response, site);

            if (string.IsNullOrEmpty(page.Content))
            {
                Logger.AllLog(spider.Identity, $"Content is empty: {request.Url}.", LogLevel.Warn);
            }
        }

        page.TargetUrl = response.RequestMessage.RequestUri.AbsoluteUri;
        return page;
    }
    catch (DownloadException de)
    {
        Page page = site.CycleRetryTimes > 0 ? Spider.AddToCycleRetry(request, site) : new Page(request);

        // The retry page may be null.
        if (page != null)
        {
            page.Exception = de;
        }

        Logger.AllLog(spider.Identity, $"Download {request.Url} failed: {de.Message}", LogLevel.Warn);
        return page;
    }
    catch (HttpRequestException he)
    {
        Page page = site.CycleRetryTimes > 0 ? Spider.AddToCycleRetry(request, site) : new Page(request);

        if (page != null)
        {
            page.Exception = he;
        }

        Logger.AllLog(spider.Identity, $"Download {request.Url} failed: {he.Message}.", LogLevel.Warn);
        return page;
    }
    catch (Exception e)
    {
        Page page = new Page(request) { Exception = e, Skip = true };
        Logger.AllLog(spider.Identity, $"Download {request.Url} failed: {e.Message}.", LogLevel.Error, e);
        return page;
    }
    finally
    {
        // Release the response even when an earlier statement threw.
        try
        {
            response?.Dispose();
        }
        catch (Exception e)
        {
            Logger.AllLog(spider.Identity, $"Close response fail: {e}", LogLevel.Error, e);
        }
    }
}
/// <summary>
/// Downloads <paramref name="request"/> with an <c>HttpClient</c> (the shared one, or a pooled one
/// when the site has a proxy pool) and turns the response into a page.
/// </summary>
/// <param name="request">The request to download; its proxy and status code are updated.</param>
/// <param name="spider">The spider owning the request; supplies the site settings and the log identity.</param>
/// <returns>
/// The downloaded page; a skipped page when the media type is not textual and file download is off,
/// or when an unexpected exception occurs; or, on a download/HTTP failure, the cycle-retry page
/// (which may be null) or an empty page carrying the exception.
/// </returns>
protected override Page DowloadContent(Request request, ISpider spider)
{
    Site site = spider.Site;
    HttpResponseMessage response = null;
    var proxy = site.GetHttpProxy();
    request.Proxy = proxy;
    try
    {
        var httpMessage = GenerateHttpRequestMessage(request, site);
        HttpClient httpClient = null == spider.Site.HttpProxyPool ? _httpClient : _httpClientPool.GetHttpClient(proxy);

        // GetAwaiter().GetResult() rethrows the original exception. ".Result" wrapped transport
        // failures in AggregateException, which skipped the HttpRequestException retry handler
        // below and marked the request as skipped instead.
        response = NetworkCenter.Current.Execute("http", () => httpClient.SendAsync(httpMessage).GetAwaiter().GetResult());
        request.StatusCode = response.StatusCode;
        response.EnsureSuccessStatusCode();
        if (!site.AcceptStatCode.Contains(response.StatusCode))
        {
            throw new DownloadException($"Download {request.Url} failed. Code {response.StatusCode}");
        }

        Page page;
        if (response.Content.Headers.ContentType != null
            && !MediaTypes.Contains(response.Content.Headers.ContentType.MediaType))
        {
            if (!site.DownloadFiles)
            {
                Logger.MyLog(spider.Identity, $"Miss request: {request.Url} because media type is not text.", LogLevel.Error);
                return new Page(request, null) { Skip = true };
            }
            else
            {
                page = SaveFile(request, response, spider);
            }
        }
        else
        {
            page = HandleResponse(request, response, site);

            if (string.IsNullOrEmpty(page.Content))
            {
                Logger.MyLog(spider.Identity, $"Content is empty: {request.Url}.", LogLevel.Warn);
            }
        }

        page.TargetUrl = response.RequestMessage.RequestUri.AbsoluteUri;
        return page;
    }
    catch (DownloadException de)
    {
        Page page = site.CycleRetryTimes > 0 ? Spider.AddToCycleRetry(request, site) : new Page(request, null);

        // The retry page may be null.
        if (page != null)
        {
            page.Exception = de;
        }

        Logger.MyLog(spider.Identity, $"Download {request.Url} failed: {de.Message}", LogLevel.Warn);
        return page;
    }
    catch (HttpRequestException he)
    {
        Page page = site.CycleRetryTimes > 0 ? Spider.AddToCycleRetry(request, site) : new Page(request, null);

        if (page != null)
        {
            page.Exception = he;
        }

        Logger.MyLog(spider.Identity, $"Download {request.Url} failed: {he.Message}.", LogLevel.Warn);
        return page;
    }
    catch (Exception e)
    {
        Page page = new Page(request, null) { Exception = e, Skip = true };
        Logger.MyLog(spider.Identity, $"Download {request.Url} failed: {e.Message}.", LogLevel.Error, e);
        return page;
    }
    finally
    {
        // Dispose the response here so it is released even when an earlier statement threw.
        try
        {
            response?.Dispose();
        }
        catch (Exception e)
        {
            Logger.MyLog(spider.Identity, "Close response fail.", LogLevel.Error, e);
        }
    }
}
/// <summary>
/// HTTP implementation of the download.
/// </summary>
/// <param name="request">The request information; its proxy (in proxy mode) and status code are updated.</param>
/// <param name="spider">The spider.</param>
/// <returns>
/// The page data; a skipped page when the media type is not in <c>ExcludeMediaTypes</c> and file
/// download is off, or when an unexpected exception occurs; or, on a download/HTTP failure, the
/// cycle-retry page (which may be null) or an empty page carrying the exception.
/// </returns>
protected override Page DowloadContent(Request request, ISpider spider)
{
    HttpResponseMessage response = null;
    try
    {
        var httpMessage = GenerateHttpRequestMessage(request, spider.Site);

        HttpClientItem httpClientItem;
        if (spider.Site.HttpProxyPool == null)
        {
            // A request can set a different DownloaderGroup to use a different HttpClient.
            httpClientItem = HttpClientPool.GetHttpClient(spider, this, CookieContainer, request.DownloaderGroup, CookieInjector);
        }
        else
        {
            // TODO: in proxy mode, reconsider how request.DownloaderGroup should apply.
            var proxy = spider.Site.HttpProxyPool.GetProxy();
            request.Proxy = proxy;
            httpClientItem = HttpClientPool.GetHttpClient(spider, this, CookieContainer, proxy?.GetHashCode(), CookieInjector);
            httpClientItem.Handler.Proxy = httpClientItem.Handler.Proxy ?? proxy;
        }

        // NOTE(review): object.Equals compares boxed values, so this only matches when _timeout
        // is a double; confirm the field type.
        if (!Equals(httpClientItem.Client.Timeout.TotalSeconds, _timeout))
        {
            httpClientItem.Client.Timeout = new TimeSpan(0, 0, (int)_timeout);
        }

        // GetAwaiter().GetResult() rethrows the original exception. ".Result" wrapped transport
        // failures in AggregateException, which skipped the HttpRequestException retry handler
        // below and marked the request as skipped instead.
        response = NetworkCenter.Current.Execute("http", () => httpClientItem.Client.SendAsync(httpMessage).GetAwaiter().GetResult());
        request.StatusCode = response.StatusCode;
        response.EnsureSuccessStatusCode();

        Page page;
        if (response.Content.Headers.ContentType != null
            && !ExcludeMediaTypes.Contains(response.Content.Headers.ContentType.MediaType))
        {
            if (!spider.Site.DownloadFiles)
            {
                Logger.Log(spider.Identity, $"Ignore: {request.Url} because media type is not allowed to download.", Level.Warn);
                return new Page(request) { Skip = true };
            }
            else
            {
                page = SaveFile(request, response, spider);
            }
        }
        else
        {
            page = HandleResponse(request, response, spider.Site);

            if (string.IsNullOrWhiteSpace(page.Content))
            {
                Logger.Log(spider.Identity, $"Content is empty: {request.Url}.", Level.Warn);
            }
        }

        page.TargetUrl = response.RequestMessage.RequestUri.AbsoluteUri;
        return page;
    }
    catch (DownloadException de)
    {
        Page page = spider.Site.CycleRetryTimes > 0 ? Spider.AddToCycleRetry(request, spider.Site) : new Page(request);

        // The retry page may be null.
        if (page != null)
        {
            page.Exception = de;
        }

        Logger.Log(spider.Identity, $"Download {request.Url} failed: {de.Message}", Level.Warn);
        return page;
    }
    catch (HttpRequestException he)
    {
        Page page = spider.Site.CycleRetryTimes > 0 ? Spider.AddToCycleRetry(request, spider.Site) : new Page(request);

        if (page != null)
        {
            page.Exception = he;
        }

        Logger.Log(spider.Identity, $"Download {request.Url} failed: {he.Message}.", Level.Warn);
        return page;
    }
    catch (Exception e)
    {
        Page page = new Page(request) { Exception = e, Skip = true };
        Logger.Log(spider.Identity, $"Download {request.Url} failed: {e.Message}.", Level.Error, e);
        return page;
    }
    finally
    {
        // Release the response even when an earlier statement threw.
        try
        {
            response?.Dispose();
        }
        catch (Exception e)
        {
            Logger.Log(spider.Identity, $"Close response fail: {e}", Level.Error, e);
        }
    }
}
/// <summary>
/// Loads the request URL in the shared web driver (signing in first when a sign-in handler is
/// configured) and returns the rendered page source.
/// </summary>
/// <param name="request">The request to download; its cycle-retry counter is cleared on success.</param>
/// <param name="spider">The spider owning the request; supplies the site settings and receives the log messages.</param>
/// <returns>
/// The downloaded page. On a <c>DownloadException</c> or <c>HttpRequestException</c> with cycle
/// retry enabled this is the result of <c>Spider.AddToCycleRetry</c>, which may be null; on any
/// other failure it is an empty page carrying the exception.
/// </returns>
protected override Page DowloadContent(Request request, ISpider spider)
{
    Site site = spider.Site;
    try
    {
        // NOTE(review): locking on "this" lets outside code contend on the same monitor; a private
        // lock object would be safer, but the field has to be declared at class level.
        lock (this)
        {
            if (_webDriver == null)
            {
                _webDriver = WebDriverUtil.Open(_browser, _option);
            }

            if (!_isLogined && SignIn != null)
            {
                _isLogined = SignIn.Handle(_webDriver as RemoteWebDriver);
                if (!_isLogined)
                {
                    throw new SpiderException("Login failed. Please check your login codes.");
                }
            }
        }

        // Rebuild the URL with a re-encoded query string; the original comment cites garbled
        // Chinese characters in URLs as the reason.
        Uri uri = request.Url;
        string query = string.IsNullOrEmpty(uri.Query) ? "" : $"?{HttpUtility.UrlPathEncode(uri.Query.Substring(1, uri.Query.Length - 1))}";
        string portPart = uri.Port == 80 ? "" : ":" + uri.Port;
        var domainUrl = $"{uri.Scheme}://{uri.DnsSafeHost}{portPart}";
        string realUrl = $"{domainUrl}{uri.AbsolutePath}{query}";

        // When the browser holds no cookies but the site defines some, open the domain root first
        // and then add the site's cookies. Site.Cookies may be null, so it is checked first.
        var options = _webDriver.Manage();
        var siteCookies = spider.Site.Cookies;
        if (options.Cookies.AllCookies.Count == 0 && siteCookies != null && siteCookies.PairPart.Count > 0)
        {
            _webDriver.Url = domainUrl;
            options.Cookies.DeleteAllCookies();
            foreach (var c in siteCookies.PairPart)
            {
                options.Cookies.AddCookie(new Cookie(c.Key, c.Value));
            }
        }

        if (UrlHandler != null)
        {
            realUrl = UrlHandler(realUrl);
        }

        NetworkCenter.Current.Execute("wd-d", () =>
        {
            _webDriver.Navigate().GoToUrl(realUrl);
            NavigateCompeleted?.Handle((RemoteWebDriver)_webDriver);
        });

        // Fixed wait before the page source is read.
        Thread.Sleep(_webDriverWaitTime);

        Page page = new Page(request, spider.Site.ContentType, site.RemoveOutboundLinks ? site.Domains : null)
        {
            Content = _webDriver.PageSource,
            TargetUrl = _webDriver.Url,
            Title = _webDriver.Title
        };

        // Clear the retry counter once finished; per the original author, persisting this value to
        // Redis makes a single task loop forever.
        request.PutExtra(Request.CycleTriedTimes, null);
        return page;
    }
    catch (DownloadException de)
    {
        Page page = site.CycleRetryTimes > 0
            ? Spider.AddToCycleRetry(request, site)
            : new Page(request, site.ContentType, null);

        // Keep the failure on the retry page too (it used to be dropped when the retry page
        // replaced the original one). The retry page may be null.
        if (page != null)
        {
            page.Exception = de;
        }

        spider.Log($"下载 {request.Url} 失败: {de.Message}", Core.Infrastructure.LogLevel.Warn);
        return page;
    }
    catch (HttpRequestException he)
    {
        Page page = site.CycleRetryTimes > 0
            ? Spider.AddToCycleRetry(request, site)
            : new Page(request, site.ContentType, null);

        if (page != null)
        {
            page.Exception = he;
        }

        spider.Log($"下载 {request.Url} 失败: {he.Message}", Core.Infrastructure.LogLevel.Warn);
        return page;
    }
    catch (Exception e)
    {
        // Log unexpected failures as well (including the sign-in failure thrown above); they
        // used to be recorded on the page only.
        spider.Log($"下载 {request.Url} 失败: {e.Message}", Core.Infrastructure.LogLevel.Warn);
        Page page = new Page(request, site.ContentType, null) { Exception = e };
        return page;
    }
}