예제 #1
0
        /// <summary>
        /// Runs the optional <c>DownloadValidation</c> callback against <paramref name="page"/>
        /// and reacts to the verdict it returns. Does nothing when no callback is registered.
        /// </summary>
        /// <param name="page">Page to validate; its <c>IsSkip</c> flag is set when the verdict is <c>Miss</c>.</param>
        /// <param name="spider">Spider on whose behalf the page was downloaded. Not read by this method.</param>
        /// <exception cref="RedialException">
        /// Verdict is <c>Failed</c>, <c>FailedAndNeedRedial</c> or <c>FailedAndNeedUpdateCookie</c>.
        /// </exception>
        /// <exception cref="SpiderExceptoin">
        /// Verdict is <c>FailedAndNeedRetryOrWait</c> or <c>FailedAndNeedWaitToVerifyCode</c>.
        /// </exception>
        protected void ValidatePage(Page page, ISpider spider)
        {
            // No custom validator registered: nothing to verify.
            if (DownloadValidation == null)
            {
                return;
            }

            switch (DownloadValidation(page))
            {
                // Verdicts that let the download continue.
                case DownloadValidationResult.Success:
                    break;

                case DownloadValidationResult.Miss:
                    page.IsSkip = true;
                    break;

                // Verdicts that abort the download with a RedialException.
                case DownloadValidationResult.Failed:
                    throw new RedialException("Customize validate failed.");

                case DownloadValidationResult.FailedAndNeedRedial:
                    if (RedialManagerUtils.RedialManager == null)
                    {
                        throw new RedialException("RedialManager is null.");
                    }

                    // Redial first, then still fail this attempt by throwing.
                    RedialManagerUtils.RedialManager?.Redial();
                    throw new RedialException("Download failed and Redial already.");

                case DownloadValidationResult.FailedAndNeedUpdateCookie:
                    // Invoke the cookie hook (when set) through SingleExecutor before failing.
                    SingleExecutor.Execute(() =>
                    {
                        CustomizeCookie?.Invoke();
                    });
                    throw new RedialException("Cookie validate failed.");

                // Verdicts that abort the download with a SpiderExceptoin.
                case DownloadValidationResult.FailedAndNeedRetryOrWait:
                    throw new SpiderExceptoin("Need retry.");

                case DownloadValidationResult.FailedAndNeedWaitToVerifyCode:
                    throw new SpiderExceptoin("Need Verify Code.");
            }
        }
        /// <summary>
        /// Sends <paramref name="request"/> with the shared <c>httpClient</c>, converts the
        /// response into a <see cref="Page"/> and runs <c>ValidatePage</c> on it.
        /// </summary>
        /// <param name="request">Request to download; proxy and status code are written into its extras.</param>
        /// <param name="spider">Spider supplying the site configuration and the logger.</param>
        /// <returns>The downloaded page, or <c>null</c> when <c>spider.Site</c> is <c>null</c>.</returns>
        /// <exception cref="DownloadException">
        /// The response status code is not contained in the site's <c>AcceptStatCode</c>.
        /// </exception>
        /// <exception cref="RedialException">Rethrown unchanged, without further validation.</exception>
        public override Page Download(Request request, ISpider spider)
        {
            if (spider.Site == null)
            {
                return null;
            }

            Site site = spider.Site;

            // Evaluated up front; the status check further down re-reads the property from the site.
            var acceptStatCodes = site.AcceptStatCode;

            //Logger.InfoFormat("Downloading page {0}", request.Url);

            // Store whatever GetHttpProxyFromPool returns in the request extras under Request.Proxy.
            request.PutExtra(Request.Proxy, site.GetHttpProxyFromPool());

            HttpResponseMessage httpResponse = null;

            try
            {
                if (PostBodyGenerator != null)
                {
                    SingleExecutor.Execute(() =>
                    {
                        PostBodyGenerator(spider.Site, request);
                    });
                }

                var outgoing = GenerateHttpRequestMessage(request, site);

                // The send is routed through RedialManagerUtils and blocks until the response arrives.
                httpResponse = RedialManagerUtils.Execute(
                    "downloader-download",
                    state => httpClient.SendAsync((HttpRequestMessage)state).Result,
                    outgoing);

                AddRequestCount();

                // EnsureSuccessStatusCode runs before the site's accepted codes are consulted.
                httpResponse.EnsureSuccessStatusCode();
                if (!site.AcceptStatCode.Contains(httpResponse.StatusCode))
                {
                    throw new DownloadException($"下载 {request.Url} 失败. Code: {httpResponse.StatusCode}");
                }

                int statusCode = (int)httpResponse.StatusCode;
                request.PutExtra(Request.StatusCode, statusCode);

                Page downloaded = HandleResponse(request, httpResponse, statusCode, site);

                // need update
                downloaded.TargetUrl = request.Url.ToString();

                // Whenever a login page is hit, everything throws after a successful redial so that
                // the Spider hands the request back to the Scheduler. Several "Custom Validate Failed"
                // warnings under multi-threading are therefore nothing to worry about; custom
                // exception types could be used to tell the cases apart.
                ValidatePage(downloaded, spider);

                // Must be cleared when done: persisting this value to Redis makes a single task
                // run in an endless loop.
                request.PutExtra(Request.CycleTriedTimes, null);

                return downloaded;
            }
            catch (RedialException)
            {
                throw;
            }
            catch (Exception error)
            {
                // Let the validator inspect a page that only carries the failure, then rethrow.
                var failedPage = new Page(request, site.ContentType);
                failedPage.Exception = error;

                ValidatePage(failedPage, spider);
                throw;
            }
            finally
            {
                // Dispose the response here so an exception in the statements above cannot leave it open.
                try
                {
                    httpResponse?.Dispose();
                }
                catch (Exception disposeError)
                {
                    spider.Logger.Warn("Close response fail.", disposeError);
                }
            }
        }
예제 #3
0
        /// <summary>
        /// Downloads <paramref name="request"/> with an <c>HttpWebRequest</c>, converts an accepted
        /// response into a <see cref="Page"/> and runs <c>ValidatePage</c> on it.
        /// </summary>
        /// <param name="request">Request to download; the status code is written into its extras.</param>
        /// <param name="spider">Spider supplying the site configuration (encoding, accepted status codes).</param>
        /// <returns>The downloaded page, or <c>null</c> when <c>spider.Site</c> is <c>null</c>.</returns>
        /// <exception cref="SpiderExceptoin">
        /// <c>StatusAccept</c> returned <c>false</c> for the response status code.
        /// </exception>
        /// <remarks>
        /// Any exception raised inside the try block is rethrown. Unless it is a
        /// <c>RedialException</c>, <c>ValidatePage</c> is first called on a page carrying it.
        /// </remarks>
        public override Page Download(Request request, ISpider spider)
        {
            if (spider.Site == null)
            {
                return(null);
            }

            Site site = spider.Site;

            ICollection <int> acceptStatCode = site.AcceptStatCode;
            var charset = site.Encoding;

            //Logger.InfoFormat("Downloading page {0}", request.Url);

            // Stays 0 until a response has been received; written to the request extras in finally.
            int statusCode = 0;

            HttpWebResponse response = null;

            try
            {
                // Invoke the optional pre-generation hook with the request, through SingleExecutor,
                // before the web request is created.
                if (CustomizeRequestBeforeGenerate != null)
                {
                    SingleExecutor.Execute(() =>
                    {
                        CustomizeRequestBeforeGenerate(request);
                    });
                }

                var httpWebRequest = GetHttpWebRequest(request, site);

                // The actual send is routed through RedialManagerUtils; the web request is passed
                // in as the state argument and cast back inside the lambda.
                response = RedialManagerUtils.Execute("downloader-download", h =>
                {
                    HttpWebRequest tmpHttpWebRequest = (HttpWebRequest)h;

                    // For POST requests with a non-empty body, encode the body with the site's
                    // encoding and write it to the request stream.
                    if (HttpConstant.Method.Post.Equals(request.Method) && !string.IsNullOrEmpty(request.PostBody))
                    {
                        var data = spider.Site.Encoding.GetBytes(request.PostBody);
#if !NET_CORE
                        tmpHttpWebRequest.ContentLength = data.Length;

                        using (Stream newStream = tmpHttpWebRequest.GetRequestStream())
                        {
                            newStream.Write(data, 0, data.Length);
                            // Explicit close; the enclosing using block also disposes the stream.
                            newStream.Close();
                        }
#else
                        // This build blocks on the async request-stream call.
                        using (Stream newStream = tmpHttpWebRequest.GetRequestStreamAsync().Result)
                        {
                            newStream.Write(data, 0, data.Length);
                            // Explicit dispose; the enclosing using block also disposes the stream.
                            newStream.Dispose();
                        }
#endif
                    }

#if !NET_CORE
                    return((HttpWebResponse)tmpHttpWebRequest?.GetResponse());
#else
                    return((HttpWebResponse)tmpHttpWebRequest?.GetResponseAsync().Result);
#endif
                }, httpWebRequest);

                statusCode = (int)response.StatusCode;
                request.PutExtra(Request.StatusCode, statusCode);
                if (StatusAccept(acceptStatCode, statusCode))
                {
                    Page page = HandleResponse(request, charset, response, statusCode, site);

                    //page.SetRawText(File.ReadAllText(@"C:\Users\Lewis\Desktop\taobao.html"));

                    // Whenever a login page is hit, everything throws after a successful redial so
                    // that the Spider hands the request back to the Scheduler.
                    // Several "Custom Validate Failed" warnings under multi-threading are therefore
                    // nothing to worry about; custom exception types could be used to tell them apart.
                    ValidatePage(page);

                    // Must be cleared when done: persisting this value to Redis makes a single task
                    // run in an endless loop.
                    request.PutExtra(Request.CycleTriedTimes, null);

#if !NET_CORE
                    // NOTE(review): this is set after the response was already received, so its
                    // effect on the current request is unclear - confirm intent.
                    httpWebRequest.ServicePoint.ConnectionLimit = int.MaxValue;
#endif

                    return(page);
                }
                else
                {
                    throw new SpiderExceptoin("Download failed.");
                }

                // The normal result has already been returned above; reaching this point would
                // necessarily mean the download failed.
                //throw new SpiderExceptoin("Download failed.");
            }
            catch (Exception e)
            {
                // For anything other than a RedialException, run ValidatePage on a page that only
                // carries the exception. If ValidatePage itself throws, that exception propagates
                // instead of the original one.
                if (!(e is RedialException))
                {
                    Page page = new Page(request, site.ContentType)
                    {
                        Exception = e
                    };

                    ValidatePage(page);
                }

                throw;
            }
            finally
            {
                // Close the response first, so an exception in the statements above cannot leave it open.
                try
                {
                    //ensure the connection is released back to pool
                    //check:
                    //EntityUtils.consume(httpResponse.getEntity());
#if !NET_CORE
                    response?.Close();
#else
                    response?.Dispose();
#endif
                }
                catch (Exception e)
                {
                    Logger.Warn("Close response fail.", e);
                }
                // Store the status code again on every exit path; it is still 0 when no response
                // was obtained.
                request.PutExtra(Request.StatusCode, statusCode);
            }
        }