Beispiel #1
0
        /// <summary>
        /// Builds the HttpWebRequest for <paramref name="scrapeRequest"/>, awaits its response
        /// and hands that response to TrackResponseAndPromote.
        /// </summary>
        /// <param name="scrapeRequest">Request the HttpWebRequest is built from.</param>
        /// <returns>The value TrackResponseAndPromote returns for the received response.</returns>
        async Task <HttpWebResponse> GetSimpleResponseAsync(ScrapeRequest scrapeRequest)
        {
            HttpWebRequest webRequest = this.BuildHttpWebRequest(scrapeRequest);

            // No redirect handling here; this is a single request/response round trip.
            WebResponse rawResponse = await webRequest.GetResponseAsync();
            return TrackResponseAndPromote(rawResponse);
        }
Beispiel #2
0
        /// <summary>
        /// Builds the HttpWebRequest for <paramref name="scrapeRequest"/>, fetches its response
        /// synchronously and hands that response to TrackResponseAndPromote.
        /// </summary>
        /// <param name="scrapeRequest">Request the HttpWebRequest is built from.</param>
        /// <returns>The value TrackResponseAndPromote returns for the received response.</returns>
        HttpWebResponse GetSimpleResponse(ScrapeRequest scrapeRequest)
        {
            HttpWebRequest webRequest = this.BuildHttpWebRequest(scrapeRequest);

            // No redirect handling here; this is a single request/response round trip.
            WebResponse rawResponse = webRequest.GetResponse();
            return TrackResponseAndPromote(rawResponse);
        }
Beispiel #3
0
        /// <summary>
        /// Produces the settings for a single request: a clone of Defaults on which every
        /// callback in request.AdjustSettings has been invoked, in enumeration order.
        /// </summary>
        /// <param name="request">Request whose AdjustSettings callbacks are applied.</param>
        /// <returns>The adjusted clone; Defaults itself is not passed to the callbacks.</returns>
        RequestSettings ApplyCustomSettingsToDefaultSettings(ScrapeRequest request)
        {
            // Work on a clone so the callbacks receive a separate object from Defaults.
            var effective = this.Defaults.Clone();

            foreach (var adjust in request.AdjustSettings)
            {
                adjust(effective);
            }

            return effective;
        }
Beispiel #4
0
 /// <summary>
 /// Executes the request (following redirects) and returns the body of the final response as text.
 /// A WebException is wrapped in a ScrapeException, unless ReturnWebExceptionResponse is set and
 /// a non-null body could be read from the failed response; in that case the body is returned.
 /// </summary>
 /// <param name="request">Request to execute.</param>
 /// <returns>The response body; on the error path only a non-null body is ever returned.</returns>
 public virtual async Task <string> GetResponseBodyAsync(ScrapeRequest request)
 {
     try {
         using (HttpWebResponse finalResponse = await this.GetFinalWebResponseWithRedirectsAsync(request)) {
             string body = await ReadNonNullResponseAsync(finalResponse);
             return body;
         }
     } catch (WebException wex) {
         // wex.Response may be null, hence the "possible null" reader.
         string errorBody = ReadPossibleNullResponse(wex.Response);
         bool handBackBody = errorBody != null && this.ReturnWebExceptionResponse;
         if (!handBackBody)
         {
             throw new ScrapeException(wex, request, errorBody);
         }
         return errorBody;
     }
 }
Beispiel #5
0
        /// <summary>
        /// Creates the HttpWebRequest for <paramref name="request"/> and configures it from the
        /// client-wide options, the per-request settings and, when present, the post data.
        /// </summary>
        /// <param name="request">Supplies the Uri, the settings adjustments and optional post data.</param>
        /// <returns>The configured request; when post data is present its bytes have already been written to the request stream.</returns>
        HttpWebRequest BuildHttpWebRequest(ScrapeRequest request)
        {
            var webRequest = (HttpWebRequest)WebRequest.Create(request.Uri);

            // Client-wide options.
            webRequest.AllowAutoRedirect = this.UseFrameworkRedirect; // redirects can instead be followed manually
            webRequest.CookieContainer = this._cookieJar;
            webRequest.CachePolicy = this.CachePolicy;

            // Per-request options: the defaults with this request's adjustments applied.
            RequestSettings effective = ApplyCustomSettingsToDefaultSettings(request);

            webRequest.Accept = effective.Accept;
            webRequest.UserAgent = effective.UserAgent;
            webRequest.ProtocolVersion = effective.ProtocolVersion;
            webRequest.KeepAlive = effective.KeepAlive;
            webRequest.Timeout = effective.Timeout;
            webRequest.Credentials = effective.Credentials;
            webRequest.Referer = effective.Referrer ?? this._lastPage; // fall back to the last page when no referrer is given

            foreach (var header in effective.Headers)
            {
                webRequest.Headers[header.Key] = header.Value;
            }

            // NOTE: AutomaticDecompression is not enabled here. The original author left a reminder
            // not to set an "Accept-Encoding" header by hand without also enabling it.

            IPostData body = request.PostData;
            if (body == null)
            {
                // No post data: return the request as configured so far.
                return webRequest;
            }

            // Post data present: turn the request into a POST and send the payload.
            byte[] payload = body.PostBytes;

            webRequest.Method = "POST";
            webRequest.ContentLength = payload.Length;
            webRequest.ContentType = body.ContentType;

            // GetRequestStream is a synchronous call.
            using (var requestBody = webRequest.GetRequestStream())
            {
                requestBody.Write(payload, 0, payload.Length);
            }

            return webRequest;
        }
Beispiel #6
0
 /// <summary>
 /// Executes the request (following redirects) and copies the body of the final response
 /// into <paramref name="oStream"/>.
 /// A WebException is wrapped in a ScrapeException, unless ReturnWebExceptionResponse is set and
 /// a non-null body could be read from the failed response; in that case the body is returned.
 /// </summary>
 /// <param name="request">Request to execute.</param>
 /// <param name="oStream">Destination stream. It is written to but not closed by this method.</param>
 /// <returns>
 /// The response content type on success, or the error response body when the request failed,
 /// ReturnWebExceptionResponse is set and a body was available.
 /// </returns>
 /// <exception cref="ScrapeException">
 /// The request raised a WebException and either ReturnWebExceptionResponse is not set
 /// or no error body could be read.
 /// </exception>
 public virtual async Task <string> WriteResponseToStreamAsync(ScrapeRequest request, Stream oStream)
 {
     try {
         using (var response = await this.GetFinalWebResponseWithRedirectsAsync(request)) {
             using (var responseStream = response.GetResponseStream())
             {
                 await responseStream.CopyToAsync(oStream);
             }
             return(response.ContentType);
         }
     }
     catch (WebException wex) {
         string responseBody = ReadPossibleNullResponse(wex.Response);
         // Only hand the body back when there is one. A failure without a body must not be
         // reported as a null return value; this matches the check in GetResponseBodyAsync.
         if (responseBody != null && this.ReturnWebExceptionResponse)
         {
             return(responseBody);
         }
         throw new ScrapeException(wex, request, responseBody);
     }
 }
Beispiel #7
0
        /// <summary>
        /// Issues the request and, unless IgnoreRedirection is set, keeps following the
        /// "Location" header of each response until a response without one is received.
        /// </summary>
        /// <param name="scrapeRequest">The initial request.</param>
        /// <returns>The response of the last page requested. The caller is responsible for closing it.</returns>
        /// <remarks>
        /// The hop counter starts at MaxForwards and is decremented before each redirect is followed,
        /// so at most MaxForwards - 1 redirects are followed. A MaxForwards of zero or less means the
        /// first redirect already fails.
        /// Each follow-up request is a new ScrapeRequest built from the redirect target only.
        /// </remarks>
        /// <exception cref="System.Net.WebException">The redirect limit was reached.</exception>
        async Task <HttpWebResponse> GetFinalWebResponseWithRedirectsAsync(ScrapeRequest scrapeRequest)
        {
            ScrapeRequest currentScrapeRequest = scrapeRequest;             // reassigned on every hop; the parameter itself is left untouched

            HttpWebResponse response = null;

            try {
                response = await GetSimpleResponseAsync(currentScrapeRequest);

                int tries = this.MaxForwards;
                while (!IgnoreRedirection && response.Headers["Location"] != null)
                {
                    // "<= 0" rather than "== 0": with a zero or negative MaxForwards the counter
                    // would otherwise step past zero and the loop would never hit the limit.
                    if (--tries <= 0)
                    {
                        throw new System.Net.WebException("Too many page forwarding");
                    }

                    // Build the next request from the response; the Location value is resolved against the current Uri.
                    currentScrapeRequest = new ScrapeRequest(new Uri(currentScrapeRequest.Uri, response.Headers["Location"]));

                    // Release the intermediate response. It is nulled so that the catch block
                    // does not close it a second time if the next request throws.
                    response.Close();
                    response = null;

                    response = await GetSimpleResponseAsync(currentScrapeRequest);
                }

                return(response);
            } catch {
                // Clean up the response on failure only; on success it belongs to the caller.
                if (response != null)
                {
                    response.Close();
                    response = null;
                }
                throw;
            }
        }
Beispiel #8
0
 /// <summary>
 /// Creates a ScrapeException that wraps a WebException and records the ScrapeRequest
 /// that produced it, together with whatever response body was read.
 /// </summary>
 /// <param name="inner">The WebException being wrapped; passed to the base constructor as the inner exception.</param>
 /// <param name="request">The request that generated the error; stored in Request.</param>
 /// <param name="responseBody">Body read from the error response; stored in ResponseBody as given.</param>
 public ScrapeException(WebException inner, ScrapeRequest request, string responseBody)
     : base("", inner) // the message passed to the base class is the empty string
 {
     Request = request;
     ResponseBody = responseBody;
 }
Beispiel #9
0
 /// <summary>
 /// Writes the response for <paramref name="request"/> to the file <paramref name="fileName"/>.
 /// The file is opened with FileMode.Append, so data is added after any content already in it.
 /// The transfer itself, including wrapping a WebException in a ScrapeException,
 /// is delegated to WriteResponseToStreamAsync.
 /// </summary>
 /// <param name="request">Request to execute.</param>
 /// <param name="fileName">Path of the file the response is appended to.</param>
 /// <returns>Whatever WriteResponseToStreamAsync returns for this request.</returns>
 /// <example>await WriteResponseToFileAsync(request, "C:\\temp\\temp.html");</example>
 public virtual async Task <string> WriteResponseToFileAsync(ScrapeRequest request, string fileName)
 {
     // No try/catch here: WriteResponseToStreamAsync already handles WebException.
     using (var target = new FileStream(fileName, FileMode.Append))
     {
         string result = await this.WriteResponseToStreamAsync(request, target);
         return result;
     }
 }
Beispiel #10
0
        /// <summary>
        /// Fetches the response body with GetResponseBodyAsync and wraps it in an HtmlDocument.
        /// Any WebException is wrapped in a ScrapeException by GetResponseBodyAsync.
        /// </summary>
        /// <param name="request">Request to execute; its Uri is passed to the HtmlDocument.</param>
        /// <returns>An HtmlDocument built from the response body and the request Uri.</returns>
        /// <remarks>
        /// Thin wrapper around GetResponseBodyAsync().
        /// This isn't virtual because you can override GetResponseBodyAsync.
        /// Per the original author's notes, parsing is deferred, so a ParseException would
        /// surface when the document is used and not from this call.
        /// </remarks>
        public async Task <HtmlDocument> GetDocumentAsync(ScrapeRequest request)
        {
            string responseBody = await this.GetResponseBodyAsync(request);

            // Pass the request Uri along, as the synchronous GetDocument does, so both
            // variants construct the document from the same inputs.
            return(new HtmlDocument(responseBody, request.Uri));
        }
Beispiel #11
0
        /// <summary>
        /// Fetches the response body with GetResponseBody and wraps it, together with the
        /// request Uri, in an HtmlDocument.
        /// Any WebException is wrapped in a ScrapeException by GetResponseBody.
        /// </summary>
        /// <param name="request">Request to execute; its Uri is passed to the HtmlDocument.</param>
        /// <returns>An HtmlDocument built from the response body and the request Uri.</returns>
        /// <remarks>
        /// Thin wrapper around GetResponseBody().
        /// Not virtual, because GetResponseBody can be overridden instead.
        /// Per the original author's notes, parsing is deferred, so a ParseException would
        /// surface when the document is used and not from this call.
        /// </remarks>
        public HtmlDocument GetDocument(ScrapeRequest request)
        {
            string body = this.GetResponseBody(request);

            // No try/catch needed: constructing the document does not parse it yet.
            return new HtmlDocument(body, request.Uri);
        }