/// <summary>
/// Builds the page handed back for a failed download and logs the failure as a warning.
/// </summary>
/// <param name="e">Exception raised while downloading.</param>
/// <param name="request">Request whose download failed.</param>
/// <param name="spider">Spider that owns the request; supplies the site settings and the log identity.</param>
/// <returns>
/// The result of <c>Spider.AddToCycleRetry</c> when <c>CycleRetryTimes</c> is positive (returned as-is, even when null),
/// otherwise a new page for the request. A non-null page carries <paramref name="e"/>.
/// </returns>
private Page CreateRetryPage(Exception e, Request request, ISpider spider)
        {
            Page failedPage;
            if (spider.Site.CycleRetryTimes > 0)
            {
                failedPage = Spider.AddToCycleRetry(request, spider.Site);
            }
            else
            {
                failedPage = new Page(request);
            }

            // The retry call may yield no page at all, so only tag a page that exists.
            if (failedPage != null)
            {
                failedPage.Exception = e;
            }

            Logger.Log(spider.Identity, $"Download {request.Url} failed: {e.Message}.", Level.Warn);
            return failedPage;
        }
Exemplo n.º 2
0
 /// <summary>
 /// Redials the network and queues the request for a cycle retry when the downloaded
 /// content contains the configured <c>Content</c> marker.
 /// </summary>
 /// <param name="page">
 /// Downloaded page. When the marker is found it is replaced by the result of
 /// <c>Spider.AddToCycleRetry</c>, which may be null.
 /// </param>
 /// <param name="spider">Spider that owns the page; it is asked to exit when the redial fails.</param>
 public override void Handle(ref Page page, ISpider spider)
 {
     if (page != null && !string.IsNullOrEmpty(page.Content) && !string.IsNullOrEmpty(Content) && page.Content.Contains(Content))
     {
         if (NetworkCenter.Current.Executor.Redial() == RedialResult.Failed)
         {
             Logger.MyLog(spider.Identity, "Exit program because redial failed.", LogLevel.Error);
             spider.Exit();
         }

         page = Spider.AddToCycleRetry(page.Request, spider.Site);

         // The retry call may yield no page; dereferencing it unconditionally would throw.
         if (page != null)
         {
             page.Exception = new DownloadException($"Content downloaded contains string: {Content}.");
         }
     }
 }
Exemplo n.º 3
0
 /// <summary>
 /// When the downloaded content contains the configured <c>Content</c> marker and a cookie
 /// injector is available, redials, queues the request for a cycle retry, re-injects the
 /// cookies and marks the page with a <c>DownloadException</c>.
 /// </summary>
 /// <param name="page">Downloaded page to inspect.</param>
 /// <param name="spider">Spider that owns the page; it is asked to exit when the redial fails.</param>
 public override void Handle(ref Page page, ISpider spider)
 {
     // The checks run in the same order as a single && chain: each one relies on the previous.
     if (page == null || string.IsNullOrEmpty(page.Content))
     {
         return;
     }

     if (string.IsNullOrEmpty(Content) || CookieInjector == null || !page.Content.Contains(Content))
     {
         return;
     }

     var redialResult = NetworkCenter.Current.Executor.Redial();
     if (redialResult == RedialResult.Failed)
     {
         spider.Exit();
     }

     Spider.AddToCycleRetry(page.Request, spider.Site);
     CookieInjector?.Inject(spider);
     page.Exception = new DownloadException($"Content downloaded contains string: {Content}.");
 }
        /// <summary>
        /// Triggers an ADSL redial when the exception message attached to the page contains the configured text.
        /// </summary>
        /// <param name="page">Page data to inspect.</param>
        /// <param name="spider">Spider that owns the page.</param>
        public override void Handle(ref Page page, ISpider spider)
        {
            // IsNullOrWhiteSpace also covers the null and empty cases.
            bool hasContent = page != null && !string.IsNullOrWhiteSpace(page.Content);
            if (!hasContent || page.Exception == null)
            {
                return;
            }

            if (!page.Exception.Message.Contains(_exceptionMessage))
            {
                return;
            }

            var redialResult = NetworkCenter.Current.Executor.Redial();
            if (redialResult == RedialResult.Failed)
            {
                Logger.AllLog(spider.Identity, "Exit program because redial failed.", LogLevel.Error);
                spider.Exit();
            }

            Spider.AddToCycleRetry(page.Request, spider.Site);
            page.Exception = new DownloadException("Download failed and redial finished already.");
        }
Exemplo n.º 5
0
 /// <summary>
 /// Triggers an ADSL redial and re-acquires the cookies when the page data contains one of the configured strings.
 /// </summary>
 /// <param name="page">Page data to inspect.</param>
 /// <param name="spider">Spider that owns the page.</param>
 public override void Handle(ref Page page, ISpider spider)
 {
     // A ref parameter cannot be captured by the lambda below, hence the local copy.
     var body = page?.Content;
     if (string.IsNullOrEmpty(body))
     {
         return;
     }

     var matched = _contents.FirstOrDefault(candidate => body.Contains(candidate));
     if (matched == null)
     {
         return;
     }

     var redialResult = NetworkCenter.Current.Executor.Redial();
     if (redialResult == RedialResult.Failed)
     {
         spider.Exit();
     }

     Spider.AddToCycleRetry(page.Request, spider.Site);
     _cookieInjector.Inject(spider);
     page.Exception = new DownloadException($"Downloaded content contains: {matched}.");
 }
Exemplo n.º 6
0
 /// <summary>
 /// Triggers an ADSL redial when the page data contains one of the configured strings.
 /// </summary>
 /// <param name="page">
 /// Page data to inspect. When a configured string is found it is replaced by the result of
 /// <c>Spider.AddToCycleRetry</c>, which may be null.
 /// </param>
 /// <param name="spider">Spider that owns the page; it is asked to exit when the redial fails.</param>
 public override void Handle(ref Page page, ISpider spider)
 {
     if (!string.IsNullOrEmpty(page?.Content))
     {
         // A ref parameter cannot be captured by the lambda below, hence the local copy.
         var content        = page.Content;
         var containContent = _contents.FirstOrDefault(c => content.Contains(c));
         if (containContent != null)
         {
             if (NetworkCenter.Current.Executor.Redial() == RedialResult.Failed)
             {
                 Logger.AllLog(spider.Identity, "Exit program because redial failed.", LogLevel.Error);
                 spider.Exit();
             }

             page = Spider.AddToCycleRetry(page.Request, spider.Site);

             // The retry call may yield no page; dereferencing it unconditionally would throw.
             if (page != null)
             {
                 page.Exception = new DownloadException($"Downloaded content contains: {containContent}.");
             }
         }
     }
 }
Exemplo n.º 7
0
        /// <summary>
        /// Counts handled pages and, each time the count reaches <c>RedialLimit</c>, resets the counter,
        /// queues the current request for a cycle retry and redials.
        /// </summary>
        /// <param name="page">Page being handled; its request is the one queued for retry.</param>
        /// <param name="spider">Spider that owns the page; it is asked to exit when the redial fails.</param>
        public override void Handle(ref Page page, ISpider spider)
        {
            // A limit of zero switches the handler off entirely.
            if (RedialLimit == 0)
            {
                return;
            }

            lock (_locker)
            {
                RequestedCount++;

                bool limitReached = RedialLimit > 0 && RequestedCount == RedialLimit;
                if (!limitReached)
                {
                    return;
                }

                RequestedCount = 0;
                Spider.AddToCycleRetry(page.Request, spider.Site);

                var redialResult = NetworkCenter.Current.Executor.Redial();
                if (redialResult == RedialResult.Failed)
                {
                    spider.Exit();
                }
            }
        }
Exemplo n.º 8
0
 /// <summary>
 /// Redials and queues the request for a cycle retry when the exception message attached to the
 /// page contains <c>ExceptionMessage</c>.
 /// </summary>
 /// <param name="page">Page to inspect; its exception is replaced after the redial.</param>
 /// <param name="spider">Spider that owns the page; it is asked to exit when the redial fails.</param>
 public override void Handle(ref Page page, ISpider spider)
 {
     // An empty ExceptionMessage already fails this condition, so no further
     // emptiness check is needed inside the block.
     if (page != null && !string.IsNullOrEmpty(page.Content) && !string.IsNullOrEmpty(ExceptionMessage) && page.Exception != null)
     {
         if (page.Exception.Message.Contains(ExceptionMessage))
         {
             if (NetworkCenter.Current.Executor.Redial() == RedialResult.Failed)
             {
                 Logger.MyLog(spider.Identity, "Exit program because redial failed.", LogLevel.Error);
                 spider.Exit();
             }
             Spider.AddToCycleRetry(page.Request, spider.Site);
             page.Exception = new DownloadException("Download failed and redial finished already.");
         }
     }
 }
        /// <summary>
        /// Downloads <paramref name="request"/> with an <c>HttpClient</c> taken from the client pool
        /// and turns the response into a page.
        /// </summary>
        /// <param name="request">Request to download; its proxy extra, status code and cycle-retry extra are updated.</param>
        /// <param name="spider">Spider that owns the request; supplies the site settings and the log identity.</param>
        /// <returns>
        /// The downloaded page on success; a skipped page when the media type is not in <c>MediaTypes</c>
        /// and file download is disabled, or when an unexpected exception occurs; otherwise a page carrying
        /// the failure. The result is null when a cycle retry was requested and
        /// <c>Spider.AddToCycleRetry</c> returned null.
        /// </returns>
        protected override Page DowloadContent(Request request, ISpider spider)
        {
            Site site = spider.Site;

            HttpResponseMessage response = null;
            var proxy = site.GetHttpProxy();

            request.PutExtra(Request.Proxy, proxy);
            try
            {
                var httpMessage = GenerateHttpRequestMessage(request, site);

                response = NetworkCenter.Current.Execute("http", message =>
                {
                    HttpClient httpClient = _httpClientPool.GetHttpClient(proxy);
                    var requestTask       = httpClient.SendAsync(message);
                    requestTask.Wait(site.Timeout);
                    if (requestTask.Status == TaskStatus.RanToCompletion)
                    {
                        return requestTask.Result;
                    }
                    else
                    {
                        // The wait elapsed before the request finished: report it as a timeout response.
                        return new HttpResponseMessage(HttpStatusCode.RequestTimeout);
                    }
                }, httpMessage);
                request.StatusCode = response.StatusCode;
                response.EnsureSuccessStatusCode();

                if (!site.AcceptStatCode.Contains(response.StatusCode))
                {
                    throw new DownloadException($"下载 {request.Url} 失败. Code {response.StatusCode}");
                }
                Page page;

                if (response.Content.Headers.ContentType != null && !MediaTypes.Contains(response.Content.Headers.ContentType.MediaType))
                {
                    if (!site.DownloadFiles)
                    {
                        Logger.MyLog(spider.Identity, $"Miss request: {request.Url} because media type is not text.", LogLevel.Error);
                        return new Page(request, site.ContentType, null)
                        {
                            IsSkip = true
                        };
                    }
                    else
                    {
                        page = SaveFile(request, response, spider);
                    }
                }
                else
                {
                    page = HandleResponse(request, response, site);
                }

                if (string.IsNullOrEmpty(page.Content))
                {
                    Logger.MyLog(spider.Identity, $"下载 {request.Url} 内容为空.", LogLevel.Warn);
                }

                // need update
                page.TargetUrl = request.Url.ToString();

                // Whenever a login is encountered here, everything throws after a successful redial so that
                // the Spider puts the request back into the Scheduler.
                // Seeing several "Warning Custom Validate Failed" messages under multithreading is therefore
                // nothing to worry about; a custom Exception could be used to tell them apart.

                // Must be cleared when finished: if this value is stored in Redis, a single task would loop forever.
                request.PutExtra(Request.CycleTriedTimes, null);

                return page;
            }
            catch (DownloadException de)
            {
                Page page = site.CycleRetryTimes > 0 ? Spider.AddToCycleRetry(request, site) : new Page(request, site.ContentType, null);

                // The retry call may yield no page; a dereference here would throw out of the catch block.
                if (page != null)
                {
                    page.Exception = de;
                }
                Logger.MyLog(spider.Identity, $"下载 {request.Url} 失败: {de.Message}", LogLevel.Warn);

                return page;
            }
            catch (HttpRequestException he)
            {
                Page page = site.CycleRetryTimes > 0 ? Spider.AddToCycleRetry(request, site) : new Page(request, site.ContentType, null);

                // Same guard as above: the retry page may be null.
                if (page != null)
                {
                    page.Exception = he;
                }
                Logger.MyLog(spider.Identity, $"下载 {request.Url} 失败: {he.Message}.", LogLevel.Warn);
                return page;
            }
            catch (Exception e)
            {
                // Unexpected failures are not retried: the page is marked as skipped.
                Page page = new Page(request, site.ContentType, null);
                page.Exception = e;
                page.IsSkip    = true;
                Logger.MyLog(spider.Identity, $"下载 {request.Url} 失败: {e.Message}.", LogLevel.Error, e);
                return page;
            }
            finally
            {
                // Close the response here so that an exception in the statements above cannot leave it open.
                try
                {
                    // Ensure the connection is released back to the pool.
                    response?.Dispose();
                }
                catch (Exception e)
                {
                    Logger.MyLog(spider.Identity, "Close response fail.", LogLevel.Error, e);
                }
            }
        }
Exemplo n.º 10
0
        /// <summary>
        /// Concrete implementation of the download work: loads the request URL in a WebDriver browser
        /// and returns the rendered page source.
        /// </summary>
        /// <param name="request">Request information.</param>
        /// <param name="spider">Spider that owns the request.</param>
        /// <returns>
        /// Page data. On failure the page carries the exception; it is null when a cycle retry was
        /// requested and <c>Spider.AddToCycleRetry</c> returned null.
        /// </returns>
        protected override Page DowloadContent(Request request, ISpider spider)
        {
            Site site = spider.Site;

            try
            {
                lock (_locker)
                {
                    // The driver is opened on first use; cookies and login are set up only at that point.
                    if (_webDriver == null)
                    {
                        _webDriver = WebDriverUtil.Open(_browser, _option);

                        if (_domains != null)
                        {
                            foreach (var domain in _domains)
                            {
                                var cookies = CookieContainer.GetCookies(new Uri(domain));
                                foreach (System.Net.Cookie cookie in cookies)
                                {
                                    AddCookieToDownloadClient(cookie);
                                }
                            }
                        }

                        if (!_isLogined && CookieInjector != null)
                        {
                            var webdriverLoginHandler = CookieInjector as WebDriverLoginHandler;
                            if (webdriverLoginHandler != null)
                            {
                                webdriverLoginHandler.Driver = _webDriver as RemoteWebDriver;
                            }
                            CookieInjector.Inject(this, spider);
                            _isLogined = true;
                        }
                    }
                }

                string realUrl = request.Url.ToString();

                NetworkCenter.Current.Execute("webdriver-download", () =>
                {
                    _webDriver.Navigate().GoToUrl(realUrl);

                    if (WebDriverHandlers != null)
                    {
                        foreach (var handler in WebDriverHandlers)
                        {
                            handler.Handle((RemoteWebDriver)_webDriver);
                        }
                    }
                });

                Thread.Sleep(_webDriverWaitTime);

                Page page = new Page(request)
                {
                    Content   = _webDriver.PageSource,
                    TargetUrl = _webDriver.Url
                };

                return page;
            }
            catch (DownloadException de)
            {
                Page page = site.CycleRetryTimes > 0 ? Spider.AddToCycleRetry(request, site) : new Page(request);

                // Keep the exception on whichever page is returned, including the retry page;
                // the retry call may also yield no page at all.
                if (page != null)
                {
                    page.Exception = de;
                }
                Logger.Log(spider.Identity, $"下载 {request.Url} 失败: {de.Message}.", Level.Warn);
                return page;
            }
            catch (Exception e)
            {
                Logger.Log(spider.Identity, $"下载 {request.Url} 失败: {e.Message}.", Level.Warn);
                Page page = new Page(request)
                {
                    Exception = e
                };
                return page;
            }
        }
Exemplo n.º 11
0
        /// <summary>
        /// Loads the request URL in a WebDriver browser, logging in and seeding the site cookies first
        /// when needed, and returns the rendered page.
        /// </summary>
        /// <param name="request">Request to download; its cycle-retry extra is cleared on success.</param>
        /// <param name="spider">Spider that owns the request; supplies the site settings and the log identity.</param>
        /// <returns>
        /// The rendered page on success. On failure the page carries the exception; it is null when a
        /// cycle retry was requested and <c>Spider.AddToCycleRetry</c> returned null.
        /// </returns>
        protected override Page DowloadContent(Request request, ISpider spider)
        {
            Site site = spider.Site;

            try
            {
                // NOTE(review): locking on `this` lets outside code contend for the same monitor;
                // a private lock object would be safer but needs a field outside this method.
                lock (this)
                {
                    _webDriver = _webDriver ?? WebDriverExtensions.Open(_browser, _option);

                    if (!_isLogined && Login != null)
                    {
                        _isLogined = Login.Handle(_webDriver as RemoteWebDriver);
                        if (!_isLogined)
                        {
                            throw new DownloadException("Login failed. Please check your login codes.");
                        }
                    }
                }

                Uri uri = request.Url;

                var domainUrl = $"{uri.Scheme}://{uri.DnsSafeHost}{(uri.Port == 80 ? "" : ":" + uri.Port)}";

                // Seed the browser with the site's cookies when it has none yet; the driver is first
                // pointed at the domain root before the cookies are added.
                var options = _webDriver.Manage();
                if (options.Cookies.AllCookies.Count == 0 && spider.Site.Cookies.PairPart.Count > 0)
                {
                    _webDriver.Url = domainUrl;
                    options.Cookies.DeleteAllCookies();
                    foreach (var c in spider.Site.Cookies.PairPart)
                    {
                        options.Cookies.AddCookie(new OpenQA.Selenium.Cookie(c.Key, c.Value));
                    }
                }

                string realUrl = request.Url.ToString();

                NetworkCenter.Current.Execute("wdd", () =>
                {
                    _webDriver.Navigate().GoToUrl(realUrl);

                    NavigateCompeleted?.Invoke((RemoteWebDriver)_webDriver);
                });

                Thread.Sleep(_webDriverWaitTime);

                Page page = new Page(request, site.RemoveOutboundLinks ? site.Domains : null)
                {
                    Content   = _webDriver.PageSource,
                    TargetUrl = _webDriver.Url,
                    Title     = _webDriver.Title
                };

                // Must be cleared when finished: if this value is stored in Redis, a single task would loop forever.
                request.PutExtra(Request.CycleTriedTimes, null);
                return page;
            }
            catch (DownloadException de)
            {
                Page page = site.CycleRetryTimes > 0 ? Spider.AddToCycleRetry(request, site) : new Page(request, null);

                // Keep the exception on whichever page is returned, including the retry page;
                // the retry call may also yield no page at all.
                if (page != null)
                {
                    page.Exception = de;
                }
                Logger.MyLog(spider.Identity, $"下载 {request.Url} 失败: {de.Message}.", NLog.LogLevel.Warn);
                return page;
            }
            catch (Exception e)
            {
                Logger.MyLog(spider.Identity, $"下载 {request.Url} 失败: {e.Message}.", NLog.LogLevel.Warn);
                Page page = new Page(request, null)
                {
                    Exception = e
                };
                return page;
            }
        }
Exemplo n.º 12
0
        /// <summary>
        /// Downloads <paramref name="request"/> with an <c>HttpWebRequest</c> and converts the response into a page.
        /// </summary>
        /// <param name="request">Request to download; its status code is updated from the response.</param>
        /// <param name="spider">Spider that owns the request; supplies the site settings and the log identity.</param>
        /// <returns>
        /// The downloaded page on success; a skipped page when the media type is not in <c>MediaTypes</c>
        /// and file download is disabled, or when an unexpected exception occurs; otherwise the retry or
        /// failure page (possibly null) carrying the exception.
        /// </returns>
        protected override Page DowloadContent(Request request, ISpider spider)
        {
            Site site  = spider.Site;
            var  proxy = site.GetHttpProxy();

            HttpWebResponse response = null;

            try
            {
                var webRequest = GenerateHttpWebRequest(request, site);
                if (proxy != null)
                {
                    webRequest.Proxy = proxy;
                }

                response = NetworkCenter.Current.Execute("http", () => (HttpWebResponse)webRequest.GetResponse());

                request.StatusCode = response.StatusCode;

                bool statusAccepted = site.AcceptStatCode.Contains(response.StatusCode);
                if (!statusAccepted)
                {
                    throw new DownloadException($"Download {request.Url} failed. Code {response.StatusCode}");
                }

                // Only the part before the first ';' is the media type; the rest holds parameters such as charset.
                var mediaType   = response.ContentType.Split(';').FirstOrDefault();
                bool isTextMedia = string.IsNullOrEmpty(mediaType) || MediaTypes.Contains(mediaType);

                Page result;
                if (isTextMedia)
                {
                    result = ConstructPage(request, response, site);
                }
                else if (site.DownloadFiles)
                {
                    result = SaveFile(request, response, spider);
                }
                else
                {
                    Logger.MyLog(spider.Identity, $"Miss request: {request.Url} because media type is not text.",
                                 LogLevel.Error);
                    var skipped = new Page(request, null);
                    skipped.Skip = true;
                    return skipped;
                }

                if (string.IsNullOrEmpty(result.Content))
                {
                    Logger.MyLog(spider.Identity, $"Content is empty: {request.Url}.", LogLevel.Warn);
                }

                result.TargetUrl = response.ResponseUri.ToString();

                return result;
            }
            catch (DownloadException de)
            {
                Page failed;
                if (site.CycleRetryTimes > 0)
                {
                    failed = Spider.AddToCycleRetry(request, site);
                }
                else
                {
                    failed = new Page(request, null);
                }

                if (failed != null)
                {
                    failed.Exception = de;
                }
                Logger.MyLog(spider.Identity, $"Download {request.Url} failed: {de.Message}", LogLevel.Warn);

                return failed;
            }
            catch (WebException he)
            {
                Page failed;
                if (site.CycleRetryTimes > 0)
                {
                    failed = Spider.AddToCycleRetry(request, site);
                }
                else
                {
                    failed = new Page(request, null);
                }

                if (failed != null)
                {
                    failed.Exception = he;
                }

                Logger.MyLog(spider.Identity, $"Download {request.Url} failed: {he.Message}.", LogLevel.Warn);
                return failed;
            }
            catch (Exception e)
            {
                // Unexpected failures are not retried: the page is marked as skipped.
                var failed = new Page(request, null);
                failed.Exception = e;
                failed.Skip      = true;

                Logger.MyLog(spider.Identity, $"Download {request.Url} failed: {e.Message}.", LogLevel.Error, e);
                return failed;
            }
            finally
            {
                // Close the response here so that an exception in the statements above cannot leave it open.
                try
                {
                    // Ensure the connection is released back to the pool.
                    response?.Dispose();
                }
                catch (Exception e)
                {
                    Logger.MyLog(spider.Identity, "Close response fail.", LogLevel.Error, e);
                }
            }
        }
Exemplo n.º 13
0
        /// <summary>
        /// Loads the request URL in a WebDriver browser and returns the rendered page, optionally
        /// submitting a form, switching to a popup window and capturing the HTML of an iframe.
        /// </summary>
        /// <param name="request">Request to download.</param>
        /// <param name="spider">Spider that owns the request; supplies the site settings and the log identity.</param>
        /// <returns>
        /// The rendered page on success: <c>Content</c> holds the page source captured before any iframe
        /// switch and <c>IframeContent</c> the iframe source when one was requested. On failure the page
        /// carries the exception; it is null when a cycle retry was requested and
        /// <c>Spider.AddToCycleRetry</c> returned null.
        /// </returns>
        protected override Page DowloadContent(Request request, ISpider spider)
        {
            Site site = spider.Site;

            try
            {
                lock (_locker)
                {
                    _webDriver = _webDriver ?? WebDriverExtensions.Open(_browser, _option);

                    if (!_isLogined && Login != null)
                    {
                        _isLogined = Login.Handle(_webDriver as RemoteWebDriver);
                        if (!_isLogined)
                        {
                            throw new DownloadException("Login failed. Please check your login codes.");
                        }
                    }
                }

                string realUrl = request.Url.ToString();

                NetworkCenter.Current.Execute("webdriver-download", () =>
                {
                    _webDriver.Navigate().GoToUrl(realUrl);

                    NavigateCompeleted?.Invoke((RemoteWebDriver)_webDriver);
                });

                Thread.Sleep(_webDriverWaitTime);

                #region [WDY]

                #region [WDY] Form fill + submit

                if (FormSubmit != null)
                {
                    FormSubmit.Handle(_webDriver as RemoteWebDriver);
                }

                #endregion

                #region [WDY] Popup page

                if (_returnPopupPage)
                {
                    // Switch to the first window that is not the current one, if there is any.
                    var currentWindowHandle = _webDriver.CurrentWindowHandle;
                    var popupWindowHandle   = string.Empty;
                    foreach (var handle in _webDriver.WindowHandles)
                    {
                        if (handle != currentWindowHandle)
                        {
                            popupWindowHandle = handle;
                            break;
                        }
                    }
                    if (!string.IsNullOrEmpty(popupWindowHandle))
                    {
                        _webDriver.SwitchTo().Window(popupWindowHandle);
                    }
                }

                #endregion

                // [WDY] Page content. Do not move this line: it must be read before the iframe switch below,
                // after which PageSource returns the iframe's HTML instead.
                string content = _webDriver.PageSource;

                #region [WDY] iframe

                /*
                 *  [WDY] Iframe settings
                 *  When ReturnIframe = true, the iframe's HTML is returned as well.
                 */

                string iframeContent = null;
                if (_option.IframeOption != null && _option.IframeOption.ReturnIframe)
                {
                    if (!string.IsNullOrEmpty(_option.IframeOption.IframeUrl) || !string.IsNullOrEmpty(_option.IframeOption.IframeName))
                    {
                        // The URL takes precedence over the name when both are configured.
                        var iframeElement = !string.IsNullOrEmpty(_option.IframeOption.IframeUrl)
                            ? _webDriver.FindElement(By.XPath($"//iframe[contains(@src,'{_option.IframeOption.IframeUrl}')]"))
                            : _webDriver.FindElement(By.XPath($"//iframe[contains(@name,'{_option.IframeOption.IframeName}')]"));

                        // After switching to the iframe, _webDriver.PageSource returns the iframe's HTML.
                        _webDriver.SwitchTo().Frame(iframeElement);

                        iframeContent = _webDriver.PageSource;

                        // NOTE(review): the driver is left focused on the iframe (the switch back to the
                        // parent frame is disabled) — confirm this is intended.
                        //_webDriver.SwitchTo().ParentFrame();
                    }
                }

                #endregion

                #endregion

                Page page = new Page(request, site.RemoveOutboundLinks ? site.Domains : null)
                {
                    // Use the source captured before the iframe switch; reading PageSource here
                    // would return the iframe's HTML again.
                    Content       = content,
                    IframeContent = iframeContent,  //[WDY]
                    TargetUrl     = _webDriver.Url,
                    Title         = _webDriver.Title
                };

                // Must be cleared when finished: if this value is stored in Redis, a single task would loop forever.
                //request.PutExtra(Request.CycleTriedTimes, null);
                return page;
            }
            catch (DownloadException de)
            {
                Page page = site.CycleRetryTimes > 0 ? Spider.AddToCycleRetry(request, site) : new Page(request, null);

                // Keep the exception on whichever page is returned, including the retry page;
                // the retry call may also yield no page at all.
                if (page != null)
                {
                    page.Exception = de;
                }
                Logger.AllLog(spider.Identity, $"下载 {request.Url} 失败: {de.Message}.", NLog.LogLevel.Warn);
                return page;
            }
            catch (Exception e)
            {
                Logger.AllLog(spider.Identity, $"下载 {request.Url} 失败: {e.Message}.", NLog.LogLevel.Warn);
                Page page = new Page(request, null)
                {
                    Exception = e
                };
                return page;
            }
        }
Exemplo n.º 14
0
        /// <summary>
        /// Loads the request URL in a WebDriver browser, after copying the stored cookies for the
        /// configured domains into it, and returns the rendered page source.
        /// </summary>
        /// <param name="request">Request to download.</param>
        /// <param name="spider">Spider that owns the request; supplies the site settings and the log identity.</param>
        /// <returns>
        /// The rendered page on success. On failure the page carries the exception; it is null when a
        /// cycle retry was requested and <c>Spider.AddToCycleRetry</c> returned null.
        /// </returns>
        protected override Page DowloadContent(Request request, ISpider spider)
        {
            Site site = spider.Site;

            try
            {
                lock (_locker)
                {
                    _webDriver = _webDriver ?? WebDriverExtensions.Open(_browser, _option);

                    // No configured domains simply means there are no cookies to copy.
                    if (_domains != null)
                    {
                        foreach (var domain in _domains)
                        {
                            var cookies = _cookieContainer.GetCookies(new Uri(domain));
                            foreach (System.Net.Cookie cookie in cookies)
                            {
                                AddCookieToDownloadClient(cookie);
                            }
                        }
                    }

                    if (!_isLogined && CookieInjector != null)
                    {
                        var webdriverLoginHandler = CookieInjector as WebDriverLoginHandler;
                        if (webdriverLoginHandler != null)
                        {
                            webdriverLoginHandler.Driver = _webDriver as RemoteWebDriver;
                        }
                        CookieInjector.Inject(this, spider);
                        _isLogined = true;
                    }
                }

                // TODO: re-implement cookie setup for WebDriverDownloader.

                string realUrl = request.Url.ToString();

                NetworkCenter.Current.Execute("webdriver-download", () =>
                {
                    _webDriver.Navigate().GoToUrl(realUrl);

                    NavigateCompeleted?.Invoke((RemoteWebDriver)_webDriver);
                });

                Thread.Sleep(_webDriverWaitTime);

                Page page = new Page(request)
                {
                    Content   = _webDriver.PageSource,
                    TargetUrl = _webDriver.Url
                };

                // Must be cleared when finished: if this value is stored in Redis, a single task would loop forever.
                //request.PutExtra(Request.CycleTriedTimes, null);
                return page;
            }
            catch (DownloadException de)
            {
                Page page = site.CycleRetryTimes > 0 ? Spider.AddToCycleRetry(request, site) : new Page(request);

                // Keep the exception on whichever page is returned, including the retry page;
                // the retry call may also yield no page at all.
                if (page != null)
                {
                    page.Exception = de;
                }
                Logger.Log(spider.Identity, $"下载 {request.Url} 失败: {de.Message}.", Level.Warn);
                return page;
            }
            catch (Exception e)
            {
                Logger.Log(spider.Identity, $"下载 {request.Url} 失败: {e.Message}.", Level.Warn);
                Page page = new Page(request)
                {
                    Exception = e
                };
                return page;
            }
        }
Exemplo n.º 15
0
        /// <summary>
        /// Downloads <paramref name="request"/> over HTTP and converts the response into a <see cref="Page"/>.
        /// </summary>
        /// <param name="request">Request to send; its Proxy and StatusCode properties are written by this method.</param>
        /// <param name="spider">Spider providing the Site configuration and the identity used for logging.</param>
        /// <returns>
        /// The downloaded page; a page flagged Skip when the response is treated as a file but file downloads are disabled,
        /// or when an unexpected exception occurs; otherwise the retry page (or a fresh page) carrying the download exception.
        /// NOTE(review): the result of Spider.AddToCycleRetry is null-checked below, so the return value can presumably be null.
        /// </returns>
        protected override Page DowloadContent(Request request, ISpider spider)
        {
            Site site  = spider.Site;
            var  proxy = site.GetHttpProxy();

            request.Proxy = proxy;
            HttpResponseMessage response = null;

            try
            {
                var        httpMessage = GenerateHttpRequestMessage(request, site);
                HttpClient httpClient  = null == spider.Site.HttpProxyPool ? HttpClient : _httpClientPool.GetHttpClient(proxy);

                // GetAwaiter().GetResult() rethrows the task's original exception. Task.Result would wrap it in an
                // AggregateException, so a transport-level HttpRequestException would bypass the retry handler below
                // and end up in the generic handler that marks the page as skipped.
                response           = NetworkCenter.Current.Execute("http", () => httpClient.SendAsync(httpMessage).GetAwaiter().GetResult());
                request.StatusCode = response.StatusCode;
                response.EnsureSuccessStatusCode();

                Page page;

                // A declared content type that is not listed in ExcludeMediaTypes is handled as a file download.
                if (response.Content.Headers.ContentType != null && !ExcludeMediaTypes.Contains(response.Content.Headers.ContentType.MediaType))
                {
                    if (!site.DownloadFiles)
                    {
                        Logger.AllLog(spider.Identity, $"Ignore: {request.Url} because media type is not allowed to download.", LogLevel.Warn);
                        return(new Page(request)
                        {
                            Skip = true
                        });
                    }

                    page = SaveFile(request, response, spider);
                }
                else
                {
                    page = HandleResponse(request, response, site);

                    if (string.IsNullOrEmpty(page.Content))
                    {
                        Logger.AllLog(spider.Identity, $"Content is empty: {request.Url}.", LogLevel.Warn);
                    }
                }

                // Record the URI of the request message attached to the response.
                page.TargetUrl = response.RequestMessage.RequestUri.AbsoluteUri;

                return(page);
            }
            catch (DownloadException de)
            {
                // Re-queue the request when cycle retry is enabled; otherwise hand back a page that carries the error.
                Page page = site.CycleRetryTimes > 0 ? Spider.AddToCycleRetry(request, site) : new Page(request);

                if (page != null)
                {
                    page.Exception = de;
                }
                Logger.AllLog(spider.Identity, $"Download {request.Url} failed: {de.Message}", LogLevel.Warn);

                return(page);
            }
            catch (HttpRequestException he)
            {
                // Same retry policy as for DownloadException.
                Page page = site.CycleRetryTimes > 0 ? Spider.AddToCycleRetry(request, site) : new Page(request);
                if (page != null)
                {
                    page.Exception = he;
                }

                Logger.AllLog(spider.Identity, $"Download {request.Url} failed: {he.Message}.", LogLevel.Warn);
                return(page);
            }
            catch (Exception e)
            {
                // Any other failure is not retried: the page is flagged to be skipped.
                Page page = new Page(request)
                {
                    Exception = e,
                    Skip      = true
                };

                Logger.AllLog(spider.Identity, $"Download {request.Url} failed: {e.Message}.", LogLevel.Error, e);
                return(page);
            }
            finally
            {
                // Always release the response; a failure while disposing is logged and not propagated.
                try
                {
                    response?.Dispose();
                }
                catch (Exception e)
                {
                    Logger.AllLog(spider.Identity, $"Close response fail: {e}", LogLevel.Error, e);
                }
            }
        }
        /// <summary>
        /// Downloads <paramref name="request"/> over HTTP, validates the status code and converts the response into a <see cref="Page"/>.
        /// </summary>
        /// <param name="request">Request to send; its Proxy and StatusCode properties are written by this method.</param>
        /// <param name="spider">Spider providing the Site configuration and the identity used for logging.</param>
        /// <returns>
        /// The downloaded page; a page flagged Skip when the response is treated as a file but file downloads are disabled,
        /// or when an unexpected exception occurs; otherwise the retry page (or a fresh page) carrying the download exception.
        /// NOTE(review): the result of Spider.AddToCycleRetry is null-checked below, so the return value can presumably be null.
        /// </returns>
        protected override Page DowloadContent(Request request, ISpider spider)
        {
            Site site = spider.Site;

            HttpResponseMessage response = null;
            var proxy = site.GetHttpProxy();

            request.Proxy = proxy;

            try
            {
                var httpMessage = GenerateHttpRequestMessage(request, site);

                HttpClient httpClient = null == spider.Site.HttpProxyPool ? _httpClient : _httpClientPool.GetHttpClient(proxy);

                // GetAwaiter().GetResult() rethrows the task's original exception. Task.Result would wrap it in an
                // AggregateException, so a transport-level HttpRequestException would bypass the retry handler below
                // and end up in the generic handler that marks the page as skipped.
                response = NetworkCenter.Current.Execute("http", () => httpClient.SendAsync(httpMessage).GetAwaiter().GetResult());

                request.StatusCode = response.StatusCode;
                response.EnsureSuccessStatusCode();

                // NOTE(review): EnsureSuccessStatusCode throws for every non-success status before this check runs,
                // so AcceptStatCode can only narrow the set of accepted success codes - confirm that is intended.
                if (!site.AcceptStatCode.Contains(response.StatusCode))
                {
                    throw new DownloadException($"Download {request.Url} failed. Code {response.StatusCode}");
                }
                Page page;

                // A declared content type that is not listed in MediaTypes is handled as a file download.
                if (response.Content.Headers.ContentType != null && !MediaTypes.Contains(response.Content.Headers.ContentType.MediaType))
                {
                    if (!site.DownloadFiles)
                    {
                        Logger.MyLog(spider.Identity, $"Miss request: {request.Url} because media type is not text.", LogLevel.Error);
                        return(new Page(request, null)
                        {
                            Skip = true
                        });
                    }

                    page = SaveFile(request, response, spider);
                }
                else
                {
                    page = HandleResponse(request, response, site);

                    if (string.IsNullOrEmpty(page.Content))
                    {
                        Logger.MyLog(spider.Identity, $"Content is empty: {request.Url}.", LogLevel.Warn);
                    }
                }

                // Record the URI of the request message attached to the response.
                page.TargetUrl = response.RequestMessage.RequestUri.AbsoluteUri;

                return(page);
            }
            catch (DownloadException de)
            {
                // Re-queue the request when cycle retry is enabled; otherwise hand back a page that carries the error.
                Page page = site.CycleRetryTimes > 0 ? Spider.AddToCycleRetry(request, site) : new Page(request, null);

                if (page != null)
                {
                    page.Exception = de;
                }
                Logger.MyLog(spider.Identity, $"Download {request.Url} failed: {de.Message}", LogLevel.Warn);

                return(page);
            }
            catch (HttpRequestException he)
            {
                // Same retry policy as for DownloadException.
                Page page = site.CycleRetryTimes > 0 ? Spider.AddToCycleRetry(request, site) : new Page(request, null);
                if (page != null)
                {
                    page.Exception = he;
                }

                Logger.MyLog(spider.Identity, $"Download {request.Url} failed: {he.Message}.", LogLevel.Warn);
                return(page);
            }
            catch (Exception e)
            {
                // Any other failure is not retried: the page is flagged to be skipped.
                Page page = new Page(request, null)
                {
                    Exception = e,
                    Skip      = true
                };

                Logger.MyLog(spider.Identity, $"Download {request.Url} failed: {e.Message}.", LogLevel.Error, e);
                return(page);
            }
            finally
            {
                // Close the response here so that an exception in the statements above cannot leave it open.
                try
                {
                    // Disposing the response releases its resources; a failure while disposing is logged and not propagated.
                    response?.Dispose();
                }
                catch (Exception e)
                {
                    Logger.MyLog(spider.Identity, "Close response fail.", LogLevel.Error, e);
                }
            }
        }
Exemplo n.º 17
0
        /// <summary>
        /// HTTP implementation of the download: sends the request through a pooled HttpClient and converts the response into a page.
        /// </summary>
        /// <param name="request">Request information; its StatusCode (and Proxy, in proxy mode) are written by this method.</param>
        /// <param name="spider">Spider providing the Site configuration and the identity used for logging.</param>
        /// <returns>
        /// Page data: the downloaded page; a page flagged Skip when the response is treated as a file but file downloads are
        /// disabled, or when an unexpected exception occurs; otherwise the retry page (or a fresh page) carrying the exception.
        /// NOTE(review): the result of Spider.AddToCycleRetry is null-checked below, so the return value can presumably be null.
        /// </returns>
        protected override Page DowloadContent(Request request, ISpider spider)
        {
            HttpResponseMessage response = null;

            try
            {
                var httpMessage = GenerateHttpRequestMessage(request, spider.Site);

                HttpClientItem httpClientItem;
                if (spider.Site.HttpProxyPool == null)
                {
                    // A request can set a different DownloaderGroup in order to use a different HttpClient.
                    httpClientItem = HttpClientPool.GetHttpClient(spider, this, CookieContainer, request.DownloaderGroup, CookieInjector);
                }
                else
                {
                    // TODO: in proxy mode, reconsider how request.DownloaderGroup should be handled.
                    var proxy = spider.Site.HttpProxyPool.GetProxy();
                    request.Proxy  = proxy;
                    httpClientItem = HttpClientPool.GetHttpClient(spider, this, CookieContainer, proxy?.GetHashCode(), CookieInjector);

                    // Only touch the setter when no proxy is configured yet.
                    // NOTE(review): assuming Handler is an HttpClientHandler, its setters reject changes once the
                    // handler has been used, so re-assigning the existing value on a reused client is avoided.
                    if (httpClientItem.Handler.Proxy == null)
                    {
                        httpClientItem.Handler.Proxy = proxy;
                    }
                }

                // Compare TimeSpan values rather than a boxed double against _timeout: object.Equals reports a mismatch
                // whenever the two numeric types differ or _timeout has a fractional part, and HttpClient.Timeout
                // cannot be changed after the client has sent its first request.
                var requiredTimeout = new TimeSpan(0, 0, (int)_timeout);
                if (httpClientItem.Client.Timeout != requiredTimeout)
                {
                    httpClientItem.Client.Timeout = requiredTimeout;
                }

                // GetAwaiter().GetResult() rethrows the task's original exception. Task.Result would wrap it in an
                // AggregateException, so a transport-level HttpRequestException would bypass the retry handler below
                // and end up in the generic handler that marks the page as skipped.
                response           = NetworkCenter.Current.Execute("http", () => httpClientItem.Client.SendAsync(httpMessage).GetAwaiter().GetResult());
                request.StatusCode = response.StatusCode;
                response.EnsureSuccessStatusCode();

                Page page;

                // A declared content type that is not listed in ExcludeMediaTypes is handled as a file download.
                if (response.Content.Headers.ContentType != null && !ExcludeMediaTypes.Contains(response.Content.Headers.ContentType.MediaType))
                {
                    if (!spider.Site.DownloadFiles)
                    {
                        Logger.Log(spider.Identity, $"Ignore: {request.Url} because media type is not allowed to download.", Level.Warn);
                        return(new Page(request)
                        {
                            Skip = true
                        });
                    }

                    page = SaveFile(request, response, spider);
                }
                else
                {
                    page = HandleResponse(request, response, spider.Site);

                    if (string.IsNullOrWhiteSpace(page.Content))
                    {
                        Logger.Log(spider.Identity, $"Content is empty: {request.Url}.", Level.Warn);
                    }
                }

                // Record the URI of the request message attached to the response.
                page.TargetUrl = response.RequestMessage.RequestUri.AbsoluteUri;

                return(page);
            }
            catch (DownloadException de)
            {
                // Re-queue the request when cycle retry is enabled; otherwise hand back a page that carries the error.
                Page page = spider.Site.CycleRetryTimes > 0 ? Spider.AddToCycleRetry(request, spider.Site) : new Page(request);

                if (page != null)
                {
                    page.Exception = de;
                }
                Logger.Log(spider.Identity, $"Download {request.Url} failed: {de.Message}", Level.Warn);

                return(page);
            }
            catch (HttpRequestException he)
            {
                // Same retry policy as for DownloadException.
                Page page = spider.Site.CycleRetryTimes > 0 ? Spider.AddToCycleRetry(request, spider.Site) : new Page(request);
                if (page != null)
                {
                    page.Exception = he;
                }

                Logger.Log(spider.Identity, $"Download {request.Url} failed: {he.Message}.", Level.Warn);
                return(page);
            }
            catch (Exception e)
            {
                // Any other failure is not retried: the page is flagged to be skipped.
                Page page = new Page(request)
                {
                    Exception = e,
                    Skip      = true
                };

                Logger.Log(spider.Identity, $"Download {request.Url} failed: {e.Message}.", Level.Error, e);
                return(page);
            }
            finally
            {
                // Always release the response; a failure while disposing is logged and not propagated.
                try
                {
                    response?.Dispose();
                }
                catch (Exception e)
                {
                    Logger.Log(spider.Identity, $"Close response fail: {e}", Level.Error, e);
                }
            }
        }
Exemplo n.º 18
0
        /// <summary>
        /// Downloads a page by navigating the WebDriver instance to the request URL and reading back the browser's page source.
        /// </summary>
        /// <param name="request">Request whose Url is loaded in the browser; its CycleTriedTimes extra is cleared on success.</param>
        /// <param name="spider">Spider supplying the Site settings (cookies, content type, retry count) and the log sink.</param>
        /// <returns>
        /// On success, a page holding the browser's source, current URL and title. On failure, a page carrying the exception,
        /// or the page returned by Spider.AddToCycleRetry when cycle retry is enabled.
        /// NOTE(review): the sibling HTTP downloaders null-check the AddToCycleRetry result, so this can presumably be null.
        /// </returns>
        protected override Page DowloadContent(Request request, ISpider spider)
        {
            Site site = spider.Site;

            try
            {
                // Open the browser and sign in at most once; the lock serialises that one-time setup.
                lock (this)
                {
                    if (_webDriver == null)
                    {
                        _webDriver = WebDriverUtil.Open(_browser, _option);
                    }

                    if (!_isLogined && SignIn != null)
                    {
                        _isLogined = SignIn.Handle(_webDriver as RemoteWebDriver);
                        if (!_isLogined)
                        {
                            throw new SpiderException("Login failed. Please check your login codes.");
                        }
                    }
                }

                // Rebuild the URL with an encoded query string so that non-ASCII (e.g. Chinese) characters are not garbled.
                Uri    uri   = request.Url;
                string query = string.IsNullOrEmpty(uri.Query) ? "" : $"?{HttpUtility.UrlPathEncode(uri.Query.Substring(1))}";

                // Omit the port only when it is the default for the URI's scheme. Testing for 80 alone would drop an
                // explicit port 80 from a non-http URL and silently redirect the request to that scheme's default port.
                string domainUrl = $"{uri.Scheme}://{uri.DnsSafeHost}{(uri.IsDefaultPort ? "" : ":" + uri.Port)}";
                string realUrl   = $"{domainUrl}{uri.AbsolutePath}{query}";

                var options     = _webDriver.Manage();
                var cookiePairs = site.Cookies?.PairPart;

                // Seed the browser with the site's cookies only when it has none yet; a site without cookies is tolerated.
                if (options.Cookies.AllCookies.Count == 0 && cookiePairs != null && cookiePairs.Count > 0)
                {
                    // The browser is pointed at the domain first, then the cookies are added.
                    _webDriver.Url = domainUrl;
                    options.Cookies.DeleteAllCookies();
                    foreach (var c in cookiePairs)
                    {
                        options.Cookies.AddCookie(new Cookie(c.Key, c.Value));
                    }
                }

                if (UrlHandler != null)
                {
                    realUrl = UrlHandler(realUrl);
                }

                NetworkCenter.Current.Execute("wd-d", () =>
                {
                    _webDriver.Navigate().GoToUrl(realUrl);

                    NavigateCompeleted?.Handle((RemoteWebDriver)_webDriver);
                });

                // Wait the configured time before reading the page source.
                Thread.Sleep(_webDriverWaitTime);

                Page page = new Page(request, spider.Site.ContentType, site.RemoveOutboundLinks ? site.Domains : null)
                {
                    Content   = _webDriver.PageSource,
                    TargetUrl = _webDriver.Url,
                    Title     = _webDriver.Title
                };

                // Must be cleared when finished: if this value is stored in Redis it causes a single task to loop forever.
                request.PutExtra(Request.CycleTriedTimes, null);
                return(page);
            }
            catch (Exception e) when (e is DownloadException || e is HttpRequestException)
            {
                // Re-queue the request when cycle retry is enabled; otherwise hand back a fresh page.
                Page page = site.CycleRetryTimes > 0 ? Spider.AddToCycleRetry(request, site) : new Page(request, site.ContentType, null);

                // Attach the exception after choosing the page so that the retry page does not lose it.
                if (page != null)
                {
                    page.Exception = e;
                }
                spider.Log($"下载 {request.Url} 失败: {e.Message}", Core.Infrastructure.LogLevel.Warn);
                return(page);
            }
            catch (Exception e)
            {
                // Any other failure (including a failed sign-in) is not retried, but it is logged instead of being swallowed.
                spider.Log($"下载 {request.Url} 失败: {e.Message}", Core.Infrastructure.LogLevel.Warn);
                Page page = new Page(request, site.ContentType, null)
                {
                    Exception = e
                };
                return(page);
            }
        }