Пример #1
0
        /// <summary>
        /// Downloads the page for <paramref name="request"/> using the site settings carried by <paramref name="task"/>.
        /// </summary>
        /// <param name="request">Request to download. Its <c>Request.StatusCode</c> extra is always updated with the last known HTTP status (0 when none was obtained).</param>
        /// <param name="task">Task whose <c>Site</c> supplies the accepted status codes, encoding, headers and retry policy.</param>
        /// <returns>
        /// The page built by <c>HandleResponse</c> on success; the result of <c>AddToCycleRetry</c> when the
        /// download fails and <c>site.CycleRetryTimes</c> is positive; otherwise <c>null</c>
        /// (also returned when the task has no site).
        /// </returns>
        public override Page Download(Request request, ITask task)
        {
            Site site = task.Site;
            if (site == null)
            {
                return null;
            }

            // site is known to be non-null from here on, so no fallback defaults are needed.
            ICollection<int> acceptStatCode = site.AcceptStatCode;
            Encoding charset = site.Encoding;
            IDictionary headers = site.GetHeaders();

            int statusCode = 0;

            HttpWebResponse response = null;
            try
            {
                HttpWebRequest httpWebRequest = GetHttpWebRequest(request, site, headers);
                response = (HttpWebResponse)httpWebRequest.GetResponse();
                statusCode = (int)response.StatusCode;
                request.PutExtra(Request.StatusCode, statusCode);
                if (StatusAccept(acceptStatCode, statusCode))
                {
                    Page page = HandleResponse(request, charset, response, statusCode, task);

                    // Must be cleared once finished: per the original author, persisting this value
                    // to Redis makes a single task run in an endless loop.
                    request.PutExtra(Request.CycleTriedTimes, null);

                    httpWebRequest.ServicePoint.ConnectionLimit = int.MaxValue;
                    OnSuccess(request);
                    return page;
                }
                else
                {
                    throw new SpiderExceptoin("Download failed.");
                }
            }
            catch (Exception e)
            {
                // GetResponse() reports HTTP error statuses by throwing WebException; the actual
                // response (and its status code) travels on the exception. Adopt it so the real
                // status is recorded and the response is closed in the finally block below.
                var webException = e as WebException;
                var errorResponse = webException?.Response as HttpWebResponse;
                if (response == null && errorResponse != null)
                {
                    response = errorResponse;
                    statusCode = (int)errorResponse.StatusCode;
                }

                if (site.CycleRetryTimes > 0)
                {
                    return AddToCycleRetry(request, site);
                }
                OnError(request, e);
                return null;
            }
            finally
            {
                request.PutExtra(Request.StatusCode, statusCode);
                try
                {
                    // Ensure the connection is released back to the pool.
                    response?.Close();
                }
                catch (Exception e)
                {
                    Logger.Warn("Close response fail.", e);
                }
            }
        }
        /// <summary>
        /// Downloads the page for <paramref name="request"/> through the shared <c>httpClient</c>,
        /// using the site settings of <paramref name="spider"/>.
        /// </summary>
        /// <param name="request">Request to download. The proxy taken from the site's pool and the HTTP status code are stored on it as extras.</param>
        /// <param name="spider">Spider whose <c>Site</c> supplies the accepted status codes, proxy pool and content type.</param>
        /// <returns>The validated page, or <c>null</c> when the spider has no site.</returns>
        /// <remarks>
        /// Any failure is rethrown to the caller. For exceptions other than <c>RedialException</c>,
        /// a page carrying the exception is first passed to <c>ValidatePage</c>.
        /// </remarks>
        public override Page Download(Request request, ISpider spider)
        {
            if (spider.Site == null)
            {
                return null;
            }

            Site site = spider.Site;

            var acceptStatCodes = site.AcceptStatCode;

            HttpResponseMessage response = null;
            var proxy = site.GetHttpProxyFromPool();
            request.PutExtra(Request.Proxy, proxy);
            int statusCode = 200;
            try
            {
                if (GeneratePostBody != null)
                {
                    SingleExecutor.Execute(() =>
                    {
                        GeneratePostBody(spider.Site, request);
                    });
                }

                var httpMessage = GenerateHttpRequestMessage(request, site);

                response = RedialManagerUtils.Execute("downloader-download", (m) =>
                {
                    var message = (HttpRequestMessage)m;
                    return httpClient.SendAsync(message).Result;
                }, httpMessage);

                AddRequestCount();

                // NOTE(review): EnsureSuccessStatusCode() throws for every non-2xx status, so a
                // non-2xx code listed in AcceptStatCode can never be accepted — confirm this is intended.
                response.EnsureSuccessStatusCode();
                if (!acceptStatCodes.Contains(response.StatusCode))
                {
                    throw new DownloadException($"下载 {request.Url} 失败. Code: {response.StatusCode}");
                }
                statusCode = (int)response.StatusCode;
                request.PutExtra(Request.StatusCode, statusCode);

                Page page = HandleResponse(request, response, statusCode, site);

                // need update
                page.TargetUrl = request.Url.ToString();

                // Original author's note: whenever a login page is hit, after a successful redial an
                // exception is thrown so the Spider puts the request back into the Scheduler. Seeing
                // several "Custom Validate Failed" warnings with multiple threads is therefore expected;
                // a dedicated exception type could be used to tell these cases apart.
                ValidatePage(page, spider);

                return page;
            }
            catch (RedialException)
            {
                throw;
            }
            catch (Exception e)
            {
                // Give the validation hook a page that carries the failure, then propagate it.
                Page page = new Page(request, site.ContentType) { Exception = e };

                ValidatePage(page, spider);
                throw;
            }
            finally
            {
                // Dispose the response here so it is released even when an earlier statement threw.
                try
                {
                    // Ensure the connection is released back to the pool.
                    response?.Dispose();
                }
                catch (Exception e)
                {
                    var logger = LogUtils.GetLogger(spider);
                    logger.Warn("Close response fail.", e);
                }
            }
        }
Пример #3
0
        /// <summary>
        /// Builds the <see cref="HttpWebRequest"/> for <paramref name="request"/>, applying the user agent,
        /// gzip preference, custom headers, cookies, timeouts and (when enabled) a pooled proxy
        /// configured on <paramref name="site"/>.
        /// </summary>
        /// <param name="request">Request being downloaded; when a proxy is used it is stored in the <c>Request.Proxy</c> extra.</param>
        /// <param name="site">Site configuration; when <c>null</c> no request is built.</param>
        /// <param name="headers">Optional extra headers (name to value); may be <c>null</c>.</param>
        /// <returns>The configured request, or <c>null</c> when <paramref name="site"/> is <c>null</c>.</returns>
        protected HttpWebRequest GetHttpWebRequest(Request request, Site site, IDictionary headers)
        {
            if (site == null)
            {
                return null;
            }

            HttpWebRequest httpWebRequest = SelectRequestMethod(request);

            // Default user agent: a single Firefox UA string (the previous literal repeated it twice).
            httpWebRequest.UserAgent = site.UserAgent ??
                                       "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0";

            if (site.IsUseGzip)
            {
                httpWebRequest.Headers.Add("Accept-Encoding", "gzip");
            }

            // Custom headers are added exactly once: WebHeaderCollection.Add appends to an existing
            // header, so adding them a second time would send every value duplicated.
            if (headers != null)
            {
                foreach (DictionaryEntry entry in headers)
                {
                    httpWebRequest.Headers.Add(entry.Key.ToString(), entry.Value.ToString());
                }
            }

            // cookie
            httpWebRequest = GeneratorCookie(httpWebRequest, site);

            // The same site timeout is applied to every timeout setting of the request.
            httpWebRequest.Timeout = site.Timeout;
            httpWebRequest.ContinueTimeout = site.Timeout;
            httpWebRequest.ReadWriteTimeout = site.Timeout;
            httpWebRequest.AllowAutoRedirect = true;

            if (site.GetHttpProxyPool().Enable)
            {
                HttpHost host = site.GetHttpProxyFromPool();
                httpWebRequest.Proxy = new WebProxy(host.Host, host.Port);
                request.PutExtra(Request.Proxy, host);
            }

            return httpWebRequest;
        }