/// <summary>
/// Downloads the page for <paramref name="request"/> with an <see cref="HttpWebRequest"/>,
/// using the settings of the site attached to <paramref name="task"/>.
/// </summary>
/// <param name="request">
/// Request to download. Its <c>Request.StatusCode</c> extra is always written before this method
/// returns, and its <c>Request.CycleTriedTimes</c> extra is cleared on success.
/// </param>
/// <param name="task">Task whose <c>Site</c> supplies the accepted status codes, encoding, headers and retry settings.</param>
/// <returns>
/// The page built by <c>HandleResponse</c> on success; the result of <c>AddToCycleRetry</c> when the
/// download fails and <c>site.CycleRetryTimes</c> is positive; otherwise <c>null</c>
/// (also <c>null</c> when the task has no site).
/// </returns>
public override Page Download(Request request, ITask task)
{
    Site site = task.Site;
    if (site == null)
    {
        return null;
    }

    // The site is known to be non-null from here on, so no fallback defaults are needed.
    ICollection<int> acceptStatCode = site.AcceptStatCode;
    Encoding charset = site.Encoding;
    IDictionary headers = site.GetHeaders();

    int statusCode = 0;
    HttpWebResponse response = null;
    // Response attached to a WebException (HTTP error status); it must be closed as well.
    WebResponse errorResponse = null;

    try
    {
        HttpWebRequest httpWebRequest = GetHttpWebRequest(request, site, headers);
        response = (HttpWebResponse)httpWebRequest.GetResponse();
        statusCode = (int)response.StatusCode;
        request.PutExtra(Request.StatusCode, statusCode);

        if (!StatusAccept(acceptStatCode, statusCode))
        {
            throw new SpiderExceptoin("Download failed.");
        }

        Page page = HandleResponse(request, charset, response, statusCode, task);

        // Original author's note: this value must be cleared once the download is finished,
        // because persisting it to Redis would make a single task run in an endless loop.
        request.PutExtra(Request.CycleTriedTimes, null);

        httpWebRequest.ServicePoint.ConnectionLimit = int.MaxValue;

        OnSuccess(request);
        return page;
    }
    catch (Exception e)
    {
        // GetResponse() reports HTTP error statuses by throwing a WebException that carries
        // the response. Remember it so the finally block can release its connection.
        var webException = e as WebException;
        if (webException != null)
        {
            errorResponse = webException.Response;
        }

        if (site.CycleRetryTimes > 0)
        {
            return AddToCycleRetry(request, site);
        }

        OnError(request, e);
        return null;
    }
    finally
    {
        // Stays 0 when no response was obtained (the failure path does not update it).
        request.PutExtra(Request.StatusCode, statusCode);

        try
        {
            // Make sure the connection is released back to the pool.
            response?.Close();
            errorResponse?.Close();
        }
        catch (Exception e)
        {
            Logger.Warn("Close response fail.", e);
        }
    }
}
/// <summary>
/// Downloads the page for <paramref name="request"/> through the shared <c>httpClient</c>,
/// using the settings of the site attached to <paramref name="spider"/>.
/// </summary>
/// <param name="request">
/// Request to download. A proxy taken from the site's pool is stored in its <c>Request.Proxy</c> extra,
/// and the response status code is stored in its <c>Request.StatusCode</c> extra on success.
/// </param>
/// <param name="spider">Spider whose <c>Site</c> supplies the download settings; also passed to <c>ValidatePage</c>.</param>
/// <returns>The validated page, or <c>null</c> when the spider has no site.</returns>
/// <exception cref="DownloadException">The response status code is not contained in <c>site.AcceptStatCode</c>.</exception>
/// <remarks>
/// A <c>RedialException</c> is rethrown untouched. For any other exception a page carrying that
/// exception is passed to <c>ValidatePage</c> and the original exception is then rethrown.
/// </remarks>
public override Page Download(Request request, ISpider spider)
{
    if (spider.Site == null)
    {
        return null;
    }

    Site site = spider.Site;
    var acceptStatCodes = site.AcceptStatCode;

    HttpRequestMessage httpMessage = null;
    HttpResponseMessage response = null;

    var proxy = site.GetHttpProxyFromPool();
    request.PutExtra(Request.Proxy, proxy);

    try
    {
        if (GeneratePostBody != null)
        {
            SingleExecutor.Execute(() =>
            {
                GeneratePostBody(spider.Site, request);
            });
        }

        httpMessage = GenerateHttpRequestMessage(request, site);

        response = RedialManagerUtils.Execute("downloader-download", (m) =>
        {
            var message = (HttpRequestMessage)m;
            return httpClient.SendAsync(message).Result;
        }, httpMessage);

        AddRequestCount();

        // NOTE(review): EnsureSuccessStatusCode throws for every non-2xx status before the
        // accepted-code check below runs, so non-2xx codes listed in site.AcceptStatCode can
        // never be accepted here — confirm whether that is intended.
        response.EnsureSuccessStatusCode();
        if (!acceptStatCodes.Contains(response.StatusCode))
        {
            throw new DownloadException($"下载 {request.Url} 失败. Code: {response.StatusCode}");
        }

        int statusCode = (int)response.StatusCode;
        request.PutExtra(Request.StatusCode, statusCode);

        Page page = HandleResponse(request, response, statusCode, site);

        // Original author's note: this assignment is flagged as needing an update.
        page.TargetUrl = request.Url.ToString();

        // Original author's note: whenever a login page is encountered here, an exception is
        // thrown after a successful redial so that the Spider puts the request back into the
        // Scheduler. Seeing several "Warning Custom Validate Failed" messages when running
        // multi-threaded is therefore nothing to worry about; a dedicated exception type could
        // be used to tell these cases apart.
        ValidatePage(page, spider);

        return page;
    }
    catch (RedialException)
    {
        throw;
    }
    catch (Exception e)
    {
        Page page = new Page(request, site.ContentType) { Exception = e };
        ValidatePage(page, spider);
        throw;
    }
    finally
    {
        // Close the response here so that an exception in the statements above cannot leave
        // it unclosed; the request message is released alongside it.
        try
        {
            response?.Dispose();
            httpMessage?.Dispose();
        }
        catch (Exception e)
        {
            var logger = LogUtils.GetLogger(spider);
            logger.Warn("Close response fail.", e);
        }
    }
}
/// <summary>
/// Builds the <see cref="HttpWebRequest"/> used to download <paramref name="request"/>, applying the
/// user agent, gzip preference, cookies, timeouts, custom headers and proxy configured on <paramref name="site"/>.
/// </summary>
/// <param name="request">
/// Request to download; when the site's proxy pool is enabled the chosen proxy is stored in its
/// <c>Request.Proxy</c> extra.
/// </param>
/// <param name="site">Site configuration; when <c>null</c> no request is built.</param>
/// <param name="headers">Optional extra headers (name to value); may be <c>null</c>.</param>
/// <returns>The configured request, or <c>null</c> when <paramref name="site"/> is <c>null</c>.</returns>
protected HttpWebRequest GetHttpWebRequest(Request request, Site site, IDictionary headers)
{
    const string DefaultUserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0";

    if (site == null)
    {
        return null;
    }

    HttpWebRequest httpWebRequest = SelectRequestMethod(request);
    httpWebRequest.UserAgent = site.UserAgent ?? DefaultUserAgent;

    if (site.IsUseGzip)
    {
        httpWebRequest.Headers.Add("Accept-Encoding", "gzip");
    }

    // Cookies.
    httpWebRequest = GeneratorCookie(httpWebRequest, site);

    // The same site timeout is applied to all three timeout settings.
    httpWebRequest.Timeout = site.Timeout;
    httpWebRequest.ContinueTimeout = site.Timeout;
    httpWebRequest.ReadWriteTimeout = site.Timeout;
    httpWebRequest.AllowAutoRedirect = true;

    // Custom headers are added exactly once: WebHeaderCollection.Add appends to an existing
    // header, so adding the same dictionary twice would duplicate every value. They are added
    // after cookie generation so they end up on the request object that is returned.
    if (headers != null)
    {
        foreach (DictionaryEntry entry in headers)
        {
            httpWebRequest.Headers.Add(entry.Key.ToString(), entry.Value.ToString());
        }
    }

    if (site.GetHttpProxyPool().Enable)
    {
        HttpHost host = site.GetHttpProxyFromPool();
        httpWebRequest.Proxy = new WebProxy(host.Host, host.Port);
        request.PutExtra(Request.Proxy, host);
    }

    return httpWebRequest;
}