/// <summary>
/// Builds a single HttpWebRequest for <paramref name="scrapeRequest"/>, sends it
/// asynchronously, and returns the tracked/promoted response.
/// No manual redirect handling happens here — see GetFinalWebResponseWithRedirectsAsync.
/// </summary>
async Task<HttpWebResponse> GetSimpleResponseAsync(ScrapeRequest scrapeRequest)
{
    var webRequest = this.BuildHttpWebRequest(scrapeRequest);
    WebResponse rawResponse = await webRequest.GetResponseAsync();
    return TrackResponseAndPromote(rawResponse);
}
/// <summary>
/// Synchronous twin of GetSimpleResponseAsync: builds a single HttpWebRequest for
/// <paramref name="scrapeRequest"/>, sends it, and returns the tracked/promoted response.
/// </summary>
HttpWebResponse GetSimpleResponse(ScrapeRequest scrapeRequest)
{
    var webRequest = this.BuildHttpWebRequest(scrapeRequest);
    WebResponse rawResponse = webRequest.GetResponse();
    return TrackResponseAndPromote(rawResponse);
}
/// <summary>
/// Clones the default RequestSettings and runs every per-request adjustment
/// delegate from <paramref name="request"/> over the copy.
/// The defaults themselves are never mutated (Clone() first).
/// </summary>
/// <returns>The merged settings for this request.</returns>
RequestSettings ApplyCustomSettingsToDefaultSettings(ScrapeRequest request)
{
    RequestSettings merged = this.Defaults.Clone();
    foreach (var applyAdjustment in request.AdjustSettings)
    {
        applyAdjustment(merged);
    }
    return merged;
}
/// <summary>
/// Gets the response body as a string, following redirects as configured.
/// Wraps any WebException with a ScrapeException.
/// </summary>
/// <param name="request">The request to issue.</param>
/// <returns>A string. Never null.</returns>
/// <exception cref="ScrapeException">Thrown when the failed response has no readable
/// body, or when <c>ReturnWebExceptionResponse</c> is false.</exception>
public virtual async Task<string> GetResponseBodyAsync(ScrapeRequest request)
{
    try
    {
        using (HttpWebResponse response = await this.GetFinalWebResponseWithRedirectsAsync(request))
        {
            return await ReadNonNullResponseAsync(response);
        }
    }
    catch (WebException wex)
    {
        // FIX: the response attached to the WebException was read but never disposed,
        // leaking the underlying connection. `using` on a null WebException.Response
        // is a no-op, so this is safe for exceptions without a response.
        string content;
        using (wex.Response)
        {
            content = ReadPossibleNullResponse(wex.Response);
        }
        if (content != null && this.ReturnWebExceptionResponse)
        {
            return content;
        }
        throw new ScrapeException(wex, request, content);
    }
}
/// <summary>
/// Builds the HttpWebRequest for a scrape request: wires up cookies and cache policy,
/// applies the merged (defaults + per-request) settings and headers, and — when the
/// request carries PostData — converts it to a POST and writes the body.
/// </summary>
/// <param name="request">The scrape request describing the target Uri, settings
/// adjustments, and optional post data.</param>
/// <returns>A fully configured HttpWebRequest, ready to be executed.</returns>
HttpWebRequest BuildHttpWebRequest(ScrapeRequest request)
{
    // build next request
    HttpWebRequest httpRequest = (HttpWebRequest)WebRequest.Create(request.Uri);
    httpRequest.AllowAutoRedirect = this.UseFrameworkRedirect; // we can do this manually
    httpRequest.CookieContainer = this._cookieJar;
    httpRequest.CachePolicy = this.CachePolicy;

    // merge per-request adjustments onto a clone of the defaults
    RequestSettings settings = ApplyCustomSettingsToDefaultSettings(request);
    httpRequest.Accept = settings.Accept;
    httpRequest.UserAgent = settings.UserAgent;
    httpRequest.ProtocolVersion = settings.ProtocolVersion;
    httpRequest.KeepAlive = settings.KeepAlive;
    httpRequest.Timeout = settings.Timeout;
    httpRequest.Credentials = settings.Credentials;
    // explicit Referrer wins; otherwise fall back to the last page visited
    httpRequest.Referer = settings.Referrer ?? this._lastPage;
    foreach (var pair in settings.Headers)
    {
        httpRequest.Headers[pair.Key] = pair.Value;
    }
    // httpRequest.AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip;
    // request.Headers["Accept-Encoding"] = "gzip,deflate"; // don't set this directly without using the above line (I guess)

    // convert it to a post message if needed
    IPostData postData = request.PostData;
    if (postData != null)
    {
        byte[] postBytes = postData.PostBytes;
        // set request headers as appropriate
        httpRequest.Method = "POST";
        httpRequest.ContentLength = postBytes.Length;
        httpRequest.ContentType = postData.ContentType;
        // feed post data into the request
        // NOTE(review): GetRequestStream() is synchronous; this method is also called
        // from async paths — confirm the blocking write here is acceptable.
        using (System.IO.Stream requestStream = httpRequest.GetRequestStream())
        {
            requestStream.Write(postBytes, 0, postBytes.Length);
        }
    }
    return (httpRequest);
}
/// <summary>
/// Writes the response to a Stream.
/// Wraps any WebException with a ScrapeException.
/// </summary>
/// <param name="request">The request to issue.</param>
/// <param name="oStream">Destination stream for the response body.</param>
/// <returns>Response content type of successful attempts, or response body of
/// unsuccessful attempts.</returns>
/// <exception cref="ScrapeException">Thrown when the failed response has no readable
/// body, or when <c>ReturnWebExceptionResponse</c> is false.</exception>
public virtual async Task<string> WriteResponseToStreamAsync(ScrapeRequest request, Stream oStream)
{
    try
    {
        using (var response = await this.GetFinalWebResponseWithRedirectsAsync(request))
        using (var responseStream = response.GetResponseStream())
        {
            await responseStream.CopyToAsync(oStream);
            return response.ContentType;
        }
    }
    catch (WebException wex)
    {
        // FIX: dispose the response attached to the WebException (was leaked);
        // `using` on a null WebException.Response is a no-op.
        string responseBody;
        using (wex.Response)
        {
            responseBody = ReadPossibleNullResponse(wex.Response);
        }
        // FIX: require a non-null body before returning it, matching the contract
        // used by GetResponseBodyAsync — previously a null could be returned here.
        if (responseBody != null && this.ReturnWebExceptionResponse)
        {
            return responseBody;
        }
        throw new ScrapeException(wex, request, responseBody);
    }
}
/// <summary>
/// Performs redirects as needed and returns the response of the last page.
/// Follows the Location header manually (unless IgnoreRedirection is set), resolving
/// relative locations against the current request Uri.
/// </summary>
/// <exception cref="WebException">Thrown when more than MaxForwards redirects occur.</exception>
async Task<HttpWebResponse> GetFinalWebResponseWithRedirectsAsync(ScrapeRequest scrapeRequest)
{
    // changes value; doesn't feel right to change the parameter itself
    ScrapeRequest currentScrapeRequest = scrapeRequest;
    HttpWebResponse response = null;
    try
    {
        response = await GetSimpleResponseAsync(currentScrapeRequest);
        int tries = this.MaxForwards;
        while (!IgnoreRedirection && response.Headers["Location"] != null)
        {
            // FIX: was `--tries == 0`, which never fires when MaxForwards <= 0,
            // allowing an infinite loop on a redirect cycle.
            if (--tries <= 0)
            {
                throw new System.Net.WebException("Too many page forwarding");
            }
            // build next request from response (Location may be relative)
            currentScrapeRequest = new ScrapeRequest(new Uri(currentScrapeRequest.Uri, response.Headers["Location"]));
            // close the previous response before issuing the next request
            response.Close();
            response = null;
            response = await GetSimpleResponseAsync(currentScrapeRequest);
        }
        return response;
    }
    catch
    {
        // cleanup response only when an exception escapes; on success the caller owns it
        if (response != null)
        {
            response.Close();
            response = null;
        }
        throw;
    }
}
/// <summary>
/// Wraps a WebException, attaching the ScrapeRequest that generated the error and
/// any response body that could be read from the failed response.
/// </summary>
/// <param name="inner">The original WebException being wrapped.</param>
/// <param name="request">The request that triggered the failure.</param>
/// <param name="responseBody">Body read from the failed response; may be null.</param>
public ScrapeException(WebException inner, ScrapeRequest request, string responseBody)
    // FIX: was base("", inner) — an empty Message hides the failure reason in logs;
    // surface the inner exception's message instead.
    : base(inner?.Message ?? "Scrape request failed", inner)
{
    this.Request = request;
    this.ResponseBody = responseBody;
}
/// <summary>
/// Saves the document retrieved from the web response to a file.
/// Wraps any WebException with a ScrapeException (via WriteResponseToStreamAsync).
/// </summary>
/// <param name="request">The request to issue.</param>
/// <param name="fileName">Path of the file to write the response body to.</param>
/// <returns>Content type (image/jpeg, application/xml, etc.)</returns>
/// <example>SaveResponse("http://www.google.com/", null, 10000, "C:\\temp\\temp.html", new Regex("www.google.com"),null );</example>
public virtual async Task<string> WriteResponseToFileAsync(ScrapeRequest request, string fileName)
{
    // NOTE(review): FileMode.Append appends to an existing file rather than
    // overwriting, so repeated saves to the same path accumulate content —
    // confirm this is intended (FileMode.Create would overwrite).
    using (System.IO.FileStream fileStream = new FileStream(fileName, FileMode.Append)){
        return (await this.WriteResponseToStreamAsync(request, fileStream));
    }
    // no need to try/catch because GetResponseBody does that for us.
}
/// <summary>
/// Fetches the page and parses it into an HtmlDocument.
/// Wraps any WebException with a ScrapeException;
/// throws ParseException if there is a problem parsing the HtmlDocument.
/// </summary>
/// <remarks>
/// Thin wrapper around GetResponseBodyAsync().
/// This isn't virtual because you can override GetResponseBodyAsync.
/// </remarks>
public async Task<HtmlDocument> GetDocumentAsync(ScrapeRequest request)
{
    string responseBody = await this.GetResponseBodyAsync(request);
    // FIX: pass request.Uri for consistency with the synchronous GetDocument overload,
    // which constructs HtmlDocument(responseBody, request.Uri) — presumably the Uri
    // serves as the document's base address; confirm against HtmlDocument.
    return new HtmlDocument(responseBody, request.Uri);
    // no need to try/catch because parsing is deferred.
}
/// <summary>
/// Synchronously fetches the page and parses it into an HtmlDocument.
/// Wraps any WebException with a ScrapeException;
/// throws ParseException if there is a problem parsing the HtmlDocument.
/// </summary>
/// <remarks>
/// Thin wrapper around GetResponseBody().
/// This isn't virtual because you can override GetResponseBody.
/// </remarks>
public HtmlDocument GetDocument(ScrapeRequest request)
{
    var body = this.GetResponseBody(request);
    var document = new HtmlDocument(body, request.Uri);
    // no need to try/catch because parsing is deferred.
    return document;
}