/// <summary>
/// Creates a page from a completed request/response exchange: stores the supplied
/// collaborators, decodes the response body with <paramref name="encoding"/>, loads the HTML,
/// and optionally loads the base URL and downloads the page's resources.
/// </summary>
/// <param name="browser">Browser instance kept in the <c>browser</c> field.</param>
/// <param name="absoluteUrl">URL kept in the <c>absoluteUrl</c> field.</param>
/// <param name="autoDownloadPagesResources">
/// When <c>true</c>, <c>LoadBaseUrl</c> and <c>DownloadResources</c> run after the HTML is loaded.
/// </param>
/// <param name="rawRequest">Request kept in the <c>rawRequest</c> field.</param>
/// <param name="rawResponse">Response whose <c>Body</c> bytes are decoded into the page content.</param>
/// <param name="encoding">Encoding assigned to <c>Encoding</c> and used to decode the body.</param>
/// <param name="autoDetectCharsetEncoding">Flag kept in the <c>autoDetectCharsetEncoding</c> field.</param>
public WebPage(ScrapingBrowser browser, Uri absoluteUrl, bool autoDownloadPagesResources, RawRequest rawRequest, RawResponse rawResponse, Encoding encoding, bool autoDetectCharsetEncoding)
{
    // Plain captures of the constructor arguments.
    this.autoDetectCharsetEncoding = autoDetectCharsetEncoding;
    this.rawResponse = rawResponse;
    this.rawRequest = rawRequest;
    this.absoluteUrl = absoluteUrl;
    this.browser = browser;

    // Start with an empty resource list.
    resources = new List<WebResource>();

    // The encoding must be assigned before the body is decoded with it.
    Encoding = encoding;
    content = Encoding.GetString(rawResponse.Body);

    LoadHtml();

    if (!autoDownloadPagesResources)
    {
        return;
    }

    LoadBaseUrl();
    DownloadResources();
}
/// <summary>
/// Executes <paramref name="request"/> through <c>GetWebResponse</c>, buffers the response body
/// and wraps the exchange in a <see cref="WebPage"/>. When <c>AllowMetaRedirect</c> is set and the
/// response content type contains "html", a <c>&lt;meta http-equiv="refresh"&gt;</c> tag in the
/// body is honoured by sleeping for its delay and returning the result of <c>DownloadRedirect</c>.
/// </summary>
/// <param name="url">Requested address; also the base used to resolve a relative redirect target.</param>
/// <param name="request">Prepared request handed to <c>GetWebResponse</c>.</param>
/// <param name="iteration">
/// Number of meta redirects already followed; no further redirect is followed once it reaches 10.
/// </param>
/// <param name="requestBody">Bytes recorded as the request body in the resulting <c>RawRequest</c>.</param>
/// <returns>
/// The page built from this response, or the page returned by <c>DownloadRedirect</c> when a
/// meta refresh is followed.
/// </returns>
private WebPage GetResponse(Uri url, HttpWebRequest request, int iteration, byte[] requestBody)
{
    const int MaxMetaRedirects = 10;

    // Thread.Sleep(TimeSpan) rejects timeouts above Int32.MaxValue milliseconds.
    const int MaxSleepSeconds = int.MaxValue / 1000;

    RawRequest rawRequest;
    RawResponse rawResponse;
    string contentType;
    byte[] bodyBytes;
    bool hasBody;

    // Copy everything needed out of the response, then dispose it (and its stream) so the
    // underlying connection is released even on the "no stream" path or when reading fails.
    using (var response = GetWebResponse(url, request))
    using (var responseStream = response.GetResponseStream())
    {
        var headers = request.Headers.AllKeys
            .Select(k => new KeyValuePair<string, string>(k, request.Headers[k]))
            .ToList();

        hasBody = responseStream != null;
        if (hasBody)
        {
            using (var buffer = new MemoryStream())
            {
                responseStream.CopyTo(buffer);
                bodyBytes = buffer.ToArray();
            }
        }
        else
        {
            bodyBytes = new byte[0];
        }

        rawRequest = new RawRequest(request.Method, request.RequestUri, request.ProtocolVersion, headers, requestBody, Encoding);
        rawResponse = new RawResponse(response.ProtocolVersion, response.StatusCode, response.StatusDescription, response.Headers, bodyBytes, Encoding);
        contentType = response.ContentType;
    }

    var webPage = new WebPage(this, url, AutoDownloadPagesResources, rawRequest, rawResponse, Encoding, AutoDetectCharsetEncoding);

    // A response without a stream is returned as-is; no meta-refresh inspection is attempted.
    if (!hasBody)
    {
        return webPage;
    }

    if (!AllowMetaRedirect
        || string.IsNullOrEmpty(contentType)
        || !contentType.Contains("html")
        || iteration >= MaxMetaRedirects)
    {
        return webPage;
    }

    // Decode the body only now that it is actually going to be parsed.
    var html = Encoding.GetString(bodyBytes).ToHtmlNode();
    var meta = html.CssSelect("meta")
        .FirstOrDefault(p => p.Attributes != null
                             && p.Attributes.HasKeyIgnoreCase("HTTP-EQUIV")
                             && p.Attributes.GetIgnoreCase("HTTP-EQUIV").Equals("refresh", StringComparison.InvariantCultureIgnoreCase));
    if (meta == null)
    {
        return webPage;
    }

    var attr = meta.Attributes.GetIgnoreCase("content");
    if (attr == null)
    {
        // NOTE(review): GetIgnoreCase is defined elsewhere; this guard assumes it may yield null
        // when the tag has no "content" attribute. Regex.Match(null) would throw.
        return webPage;
    }

    var match = parseMetaRefreshRegex.Match(attr);
    if (!match.Success)
    {
        return webPage;
    }

    var seconds = 0;
    if (match.Groups["seconds"].Success)
    {
        // The delay comes from the downloaded page: parse it culture-invariantly and treat a
        // value that does not fit (or that Thread.Sleep cannot honour) as "do not redirect"
        // instead of throwing.
        if (!int.TryParse(
                match.Groups["seconds"].Value,
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out seconds)
            || seconds > MaxSleepSeconds)
        {
            return webPage;
        }

        // A negative timeout would make Thread.Sleep throw; treat it as an immediate refresh.
        seconds = Math.Max(0, seconds);
    }

    if (!match.Groups["url"].Success)
    {
        return webPage;
    }

    var redirect = Unquote(match.Groups["url"].Value);
    Uri redirectUrl;
    if (!Uri.TryCreate(redirect, UriKind.RelativeOrAbsolute, out redirectUrl))
    {
        return webPage;
    }

    if (!redirectUrl.IsAbsoluteUri)
    {
        // Rebuild "scheme://host[:port]" of the current page as the base for the relative target.
        var baseUrl = string.Format("{0}://{1}", url.Scheme, url.Host);
        if (!url.IsDefaultPort)
        {
            baseUrl += ":" + url.Port;
        }

        if (redirect.StartsWith("/", StringComparison.Ordinal))
        {
            // Root-relative target: append directly to the site root.
            redirectUrl = baseUrl.CombineUrl(redirect);
        }
        else
        {
            // Document-relative target: keep the current path minus its leading "/" segment
            // and its last segment, then append the target.
            var path = string.Join("/", url.Segments.Take(url.Segments.Length - 1).Skip(1));
            redirectUrl = baseUrl.CombineUrl(path).Combine(redirect);
        }
    }

    // Honour the refresh delay before following the redirect.
    Thread.Sleep(TimeSpan.FromSeconds(seconds));

    return DownloadRedirect(redirectUrl, iteration + 1);
}