Пример #1
0
        /// <summary>
        /// Navigates the web driver to the request URL and wraps the outcome in a <see cref="Page"/>.
        /// The page content is taken from <c>_fiddlerClient.ResponseBodyString</c>; the target URL and
        /// title are read from the web driver after navigation.
        /// </summary>
        /// <param name="request">Request whose URL is navigated to.</param>
        /// <param name="spider">Spider whose <c>Site</c> supplies the content type and outbound-link settings.</param>
        /// <returns>
        /// The downloaded page. If anything other than a <c>DownloadException</c> is thrown, a page
        /// carrying that exception in its <c>Exception</c> property is returned instead.
        /// </returns>
        /// <exception cref="DownloadException">Propagated unchanged to the caller.</exception>
        protected override Page DowloadContent(Request request, ISpider spider)
        {
            Site site = spider.Site;

            try
            {
                // NOTE(review): locking on `this` lets outside code contend for the same monitor;
                // a private lock object would be safer - confirm no other member relies on it first.
                lock (this)
                {
                    // Open the Chrome driver on first use.
                    if (_webDriver == null)
                    {
                        _webDriver = WebDriverUtil.Open(Browser.Chrome, _option);
                    }

                    // Run the login callback until it has succeeded once.
                    bool loginPending = !_isLogined && Login != null;
                    if (loginPending)
                    {
                        RemoteWebDriver loginDriver = _webDriver as RemoteWebDriver;
                        _isLogined = Login.Invoke(loginDriver);
                        if (!_isLogined)
                        {
                            throw new SpiderException("Login failed. Please check your login codes.");
                        }
                    }

                    // Rebuild the URL with a path-encoded query string
                    // (works around garbled Chinese characters in URLs).
                    Uri target = request.Url;
                    string rawQuery = target.Query;
                    string encodedQuery = string.IsNullOrEmpty(rawQuery)
                        ? ""
                        : "?" + HttpUtility.UrlPathEncode(rawQuery.Substring(1));
                    string navigateUrl = $"{target.Scheme}://{target.DnsSafeHost}:{target.Port}{target.AbsolutePath}{encodedQuery}";

                    if (UrlFormat != null)
                    {
                        navigateUrl = UrlFormat(navigateUrl);
                    }

                    NetworkCenter.Current.Execute("fid-d", () => _webDriver.Navigate().GoToUrl(navigateUrl));

                    // Wait the configured time after navigating before reading the result.
                    Thread.Sleep(_webDriverWaitTime);

                    var afterNavigate = AfterNavigate;
                    if (afterNavigate != null)
                    {
                        afterNavigate.Invoke((RemoteWebDriver)_webDriver);
                    }

                    Page result = new Page(request, spider.Site.ContentType, site.RemoveOutboundLinks ? site.Domain : null);
                    result.Content = _fiddlerClient.ResponseBodyString;
                    _fiddlerClient.Clear();
                    result.TargetUrl = _webDriver.Url;
                    result.Title = _webDriver.Title;

                    // Must be reset to null once finished: if this value is stored in Redis
                    // it causes a single task to be run in an endless loop.
                    request.PutExtra(Request.CycleTriedTimes, null);

                    return result;
                }
            }
            catch (Exception ex) when (!(ex is DownloadException))
            {
                // DownloadException is left to propagate; every other failure is reported on the page.
                Page failed = new Page(request, site.ContentType, null);
                failed.Exception = ex;
                return failed;
            }
        }
Пример #2
0
        /// <summary>
        /// Concrete implementation of the download step: navigates the web driver to the request URL
        /// and returns a page built from the driver's page source and current URL.
        /// </summary>
        /// <param name="request">Request information.</param>
        /// <param name="spider">Spider.</param>
        /// <returns>
        /// Page data wrapped in a completed task. On failure the returned page carries the exception,
        /// or is the page produced by <c>AddToCycleRetry</c> when a <c>DownloadException</c> occurs
        /// and the site's <c>CycleRetryTimes</c> is greater than zero.
        /// </returns>
        protected override Task<Page> DowloadContent(Request request, ISpider spider)
        {
            Site siteSettings = spider.Site;

            try
            {
                lock (_locker)
                {
                    // First use: open the driver, copy cookies for the configured domains, inject login cookies.
                    if (_webDriver == null)
                    {
                        _webDriver = WebDriverUtil.Open(_browser, _option);

                        if (_domains != null)
                        {
                            foreach (var domain in _domains)
                            {
                                foreach (System.Net.Cookie cookie in CookieContainer.GetCookies(new Uri(domain)))
                                {
                                    AddCookieToDownloadClient(cookie);
                                }
                            }
                        }

                        if (!_isLogined && CookieInjector != null)
                        {
                            // A web-driver based login handler is handed the driver before injecting.
                            var loginHandler = CookieInjector as WebDriverLoginHandler;
                            if (loginHandler != null)
                            {
                                loginHandler.Driver = _webDriver as RemoteWebDriver;
                            }

                            CookieInjector.Inject(this, spider);
                            _isLogined = true;
                        }
                    }
                }

                // The request URL is used exactly as supplied.
                string targetUrl = request.Url;

                NetworkCenter.Current.Execute("webdriver-download", () =>
                {
                    _webDriver.Navigate().GoToUrl(targetUrl);

                    if (WebDriverHandlers == null)
                    {
                        return;
                    }

                    foreach (var driverHandler in WebDriverHandlers)
                    {
                        driverHandler.Handle((RemoteWebDriver)_webDriver);
                    }
                });

                // Wait the configured time after navigating before reading the page source.
                Thread.Sleep(_webDriverWaitTime);

                Page downloaded = new Page(request);
                downloaded.Content = _webDriver.PageSource;
                downloaded.TargetUrl = _webDriver.Url;

                return Task.FromResult(downloaded);
            }
            catch (DownloadException downloadError)
            {
                Page failed = new Page(request);
                failed.Exception = downloadError;

                // When cycle retries are enabled, the retry page replaces the failed one.
                if (siteSettings.CycleRetryTimes > 0)
                {
                    failed = siteSettings.AddToCycleRetry(request);
                }

                spider.Logger.Error($"下载 {request.Url} 失败: {downloadError.Message}.");
                return Task.FromResult(failed);
            }
            catch (Exception error)
            {
                spider.Logger.Error($"下载 {request.Url} 失败: {error.Message}.");

                Page failed = new Page(request);
                failed.Exception = error;
                return Task.FromResult(failed);
            }
        }