/// <summary>
/// Runs the user-supplied <c>DownloadValidation</c> callback against <paramref name="page"/>
/// and reacts to the verdict it returns.
/// </summary>
/// <param name="page">Page to validate; flagged as skipped when the verdict is <c>Miss</c>.</param>
/// <param name="spider">Not read by this method.</param>
/// <exception cref="RedialException">
/// The verdict is <c>Failed</c>, <c>FailedAndNeedRedial</c> or <c>FailedAndNeedUpdateCookie</c>.
/// </exception>
/// <exception cref="SpiderExceptoin">
/// The verdict is <c>FailedAndNeedRetryOrWait</c> or <c>FailedAndNeedWaitToVerifyCode</c>.
/// </exception>
protected void ValidatePage(Page page, ISpider spider)
{
    // Custom validation is optional; without a callback there is nothing to check.
    if (DownloadValidation == null)
    {
        return;
    }

    var verdict = DownloadValidation(page);
    switch (verdict)
    {
        case DownloadValidationResult.Success:
            break;

        case DownloadValidationResult.Miss:
            page.IsSkip = true;
            break;

        case DownloadValidationResult.Failed:
            throw new RedialException("Customize validate failed.");

        case DownloadValidationResult.FailedAndNeedRedial:
            // A redial was requested, so a redial manager must be configured.
            if (RedialManagerUtils.RedialManager == null)
            {
                throw new RedialException("RedialManager is null.");
            }

            RedialManagerUtils.RedialManager?.Redial();
            throw new RedialException("Download failed and Redial already.");

        case DownloadValidationResult.FailedAndNeedUpdateCookie:
            // Invoke the cookie callback through SingleExecutor, then report the failure.
            SingleExecutor.Execute(() => { CustomizeCookie?.Invoke(); });
            throw new RedialException("Cookie validate failed.");

        case DownloadValidationResult.FailedAndNeedRetryOrWait:
            throw new SpiderExceptoin("Need retry.");

        case DownloadValidationResult.FailedAndNeedWaitToVerifyCode:
            throw new SpiderExceptoin("Need Verify Code.");
    }
}
/// <summary>
/// Downloads the page for <paramref name="request"/> through the shared <c>httpClient</c>
/// and runs custom validation on the result.
/// </summary>
/// <param name="request">Request to download; the proxy and the status code are stored in its extras.</param>
/// <param name="spider">Spider whose Site supplies the proxy pool, the accepted status codes and the content type.</param>
/// <returns>The downloaded page, or <c>null</c> when the spider has no Site.</returns>
/// <exception cref="RedialException">Rethrown unchanged when raised while downloading or validating.</exception>
/// <exception cref="DownloadException">The response status code is not in the site's accepted list.</exception>
public override Page Download(Request request, ISpider spider)
{
    if (spider.Site == null)
    {
        return null;
    }

    Site site = spider.Site;

    // Declared outside the try block so that the finally block can release both of them.
    HttpRequestMessage httpMessage = null;
    HttpResponseMessage response = null;

    var proxy = site.GetHttpProxyFromPool();
    request.PutExtra(Request.Proxy, proxy);

    try
    {
        if (PostBodyGenerator != null)
        {
            SingleExecutor.Execute(() => { PostBodyGenerator(spider.Site, request); });
        }

        httpMessage = GenerateHttpRequestMessage(request, site);
        response = RedialManagerUtils.Execute("downloader-download", (m) =>
        {
            var message = (HttpRequestMessage)m;
            return httpClient.SendAsync(message).Result;
        }, httpMessage);
        AddRequestCount();

        // NOTE(review): EnsureSuccessStatusCode throws for every non-2xx response, so the
        // accepted-status check below can only ever reject 2xx codes - confirm this is intended.
        response.EnsureSuccessStatusCode();
        if (!site.AcceptStatCode.Contains(response.StatusCode))
        {
            throw new DownloadException($"下载 {request.Url} 失败. Code: {response.StatusCode}");
        }

        int statusCode = (int)response.StatusCode;
        request.PutExtra(Request.StatusCode, statusCode);

        Page page = HandleResponse(request, response, statusCode, site);
        page.TargetUrl = request.Url.ToString();

        // Whenever a login page is hit, an exception is thrown after the redial succeeds so that
        // the Spider hands the request back to the Scheduler. Several "Custom Validate Failed"
        // warnings under multi-threading are therefore expected; custom exception types could be
        // used to tell them apart.
        ValidatePage(page, spider);

        // Must be cleared once finished: persisting this value to Redis would make a single
        // task run in an endless loop.
        request.PutExtra(Request.CycleTriedTimes, null);

        return page;
    }
    catch (RedialException)
    {
        throw;
    }
    catch (Exception e)
    {
        // Give custom validation a chance to react to the failure before it propagates.
        Page page = new Page(request, site.ContentType) { Exception = e };
        ValidatePage(page, spider);
        throw;
    }
    finally
    {
        // Close the response first, so that it is released even when an earlier statement threw
        // (ensures the connection is released back to the pool).
        try
        {
            response?.Dispose();
        }
        catch (Exception e)
        {
            spider.Logger.Warn("Close response fail.", e);
        }

        // The request message owns its content and is no longer needed once the call is over.
        try
        {
            httpMessage?.Dispose();
        }
        catch (Exception e)
        {
            spider.Logger.Warn("Dispose request message fail.", e);
        }
    }
}
/// <summary>
/// Downloads the page for <paramref name="request"/> with an <c>HttpWebRequest</c> and runs
/// custom validation on the result.
/// </summary>
/// <param name="request">Request to download; its status code is written back into its extras.</param>
/// <param name="spider">Spider whose Site supplies the encoding, the accepted status codes and the content type.</param>
/// <returns>The downloaded page, or <c>null</c> when the spider has no Site.</returns>
/// <exception cref="SpiderExceptoin">The response status code is not accepted by the site.</exception>
public override Page Download(Request request, ISpider spider)
{
    Site site = spider.Site;
    if (site == null)
    {
        return null;
    }

    ICollection<int> acceptedCodes = site.AcceptStatCode;
    var encoding = site.Encoding;
    int statusCode = 0;
    HttpWebResponse response = null;

    try
    {
        if (CustomizeRequestBeforeGenerate != null)
        {
            SingleExecutor.Execute(() => { CustomizeRequestBeforeGenerate(request); });
        }

        var httpWebRequest = GetHttpWebRequest(request, site);
        response = RedialManagerUtils.Execute("downloader-download", h =>
        {
            HttpWebRequest webRequest = (HttpWebRequest)h;

            // Only POST requests that carry a body need the request stream written.
            bool hasPostBody = HttpConstant.Method.Post.Equals(request.Method)
                               && !string.IsNullOrEmpty(request.PostBody);
            if (hasPostBody)
            {
                var payload = spider.Site.Encoding.GetBytes(request.PostBody);
#if !NET_CORE
                webRequest.ContentLength = payload.Length;
                using (Stream body = webRequest.GetRequestStream())
#else
                using (Stream body = webRequest.GetRequestStreamAsync().Result)
#endif
                {
                    // Leaving the using block closes the stream.
                    body.Write(payload, 0, payload.Length);
                }
            }

#if !NET_CORE
            return (HttpWebResponse)webRequest?.GetResponse();
#else
            return (HttpWebResponse)webRequest?.GetResponseAsync().Result;
#endif
        }, httpWebRequest);

        statusCode = (int)response.StatusCode;
        request.PutExtra(Request.StatusCode, statusCode);

        // Anything the site does not accept counts as a failed download.
        if (!StatusAccept(acceptedCodes, statusCode))
        {
            throw new SpiderExceptoin("Download failed.");
        }

        Page page = HandleResponse(request, encoding, response, statusCode, site);

        // Whenever a login page is hit, an exception is thrown after the redial succeeds so that
        // the Spider hands the request back to the Scheduler. Several "Custom Validate Failed"
        // warnings under multi-threading are therefore expected; custom exception types could be
        // used to tell them apart.
        ValidatePage(page);

        // Must be cleared once finished: persisting this value to Redis would make a single
        // task run in an endless loop.
        request.PutExtra(Request.CycleTriedTimes, null);

#if !NET_CORE
        httpWebRequest.ServicePoint.ConnectionLimit = int.MaxValue;
#endif

        return page;
    }
    catch (RedialException)
    {
        // Redial failures bypass validation and propagate untouched.
        throw;
    }
    catch (Exception e)
    {
        // Let custom validation inspect the failure before it propagates.
        Page failedPage = new Page(request, site.ContentType) { Exception = e };
        ValidatePage(failedPage);
        throw;
    }
    finally
    {
        // Close the response first, so that it is released even when an earlier statement threw
        // (ensures the connection is released back to the pool).
        try
        {
#if !NET_CORE
            response?.Close();
#else
            response?.Dispose();
#endif
        }
        catch (Exception e)
        {
            Logger.Warn("Close response fail.", e);
        }

        // Always record the last known status code (0 when no response was received).
        request.PutExtra(Request.StatusCode, statusCode);
    }
}