Example #1
0
        /// <summary>
        /// Creates a page from an already-received response: stores the request/response pair,
        /// decodes the response body into text, calls <c>LoadHtml</c>, and optionally loads the
        /// base URL and downloads the page's resources.
        /// </summary>
        /// <param name="browser">The browser instance kept on the page.</param>
        /// <param name="absoluteUrl">The URL kept as this page's absolute address.</param>
        /// <param name="autoDownloadPagesResources">
        /// When <c>true</c>, <c>LoadBaseUrl</c> and <c>DownloadResources</c> run at the end of construction.
        /// </param>
        /// <param name="rawRequest">The request kept on the page.</param>
        /// <param name="rawResponse">The response kept on the page; its <c>Body</c> bytes are decoded into the page content.</param>
        /// <param name="encoding">Assigned to the <c>Encoding</c> property, which is then used to decode the body.</param>
        /// <param name="autoDetectCharsetEncoding">Flag stored on the page; not otherwise used by this constructor.</param>
        public WebPage(ScrapingBrowser browser, Uri absoluteUrl, bool autoDownloadPagesResources, RawRequest rawRequest, RawResponse rawResponse,
                       Encoding encoding, bool autoDetectCharsetEncoding)
        {
            // Plain state capture; these assignments do not depend on one another.
            this.autoDetectCharsetEncoding = autoDetectCharsetEncoding;
            this.rawResponse = rawResponse;
            this.rawRequest  = rawRequest;
            this.absoluteUrl = absoluteUrl;
            this.browser     = browser;

            // The Encoding property must be set before it is read back to decode the body.
            Encoding  = encoding;
            resources = new List<WebResource>();
            content   = Encoding.GetString(rawResponse.Body);

            LoadHtml();

            if (!autoDownloadPagesResources)
            {
                return;
            }

            LoadBaseUrl();
            DownloadResources();
        }
Example #2
0
        /// <summary>
        /// Reads the response for <paramref name="request"/>, wraps it in a <see cref="WebPage"/>, and,
        /// when <c>AllowMetaRedirect</c> is set and the response content type contains "html", follows an
        /// HTML <c>&lt;meta http-equiv="refresh"&gt;</c> redirect found in the body.
        /// </summary>
        /// <param name="url">The URL that was requested; also the base for resolving a relative refresh target.</param>
        /// <param name="request">The request whose response is read; its method, URI, version and headers are recorded.</param>
        /// <param name="iteration">Current meta-redirect depth; refresh directives are only followed while this is below 10.</param>
        /// <param name="requestBody">The bytes recorded as the request body in the resulting raw request.</param>
        /// <returns>
        /// The page built from this response, or the result of <c>DownloadRedirect</c> when a usable
        /// meta refresh directive is found.
        /// </returns>
        private WebPage GetResponse(Uri url, HttpWebRequest request, int iteration, byte[] requestBody)
        {
            var response       = GetWebResponse(url, request);
            var responseStream = response.GetResponseStream();
            var headers        = request.Headers.AllKeys.Select(k => new KeyValuePair <string, string>(k, request.Headers[k])).ToList();

            if (responseStream == null)
            {
                return(new WebPage(this, url, AutoDownloadPagesResources,
                                   new RawRequest(request.Method, request.RequestUri, request.ProtocolVersion, headers, requestBody, Encoding),
                                   new RawResponse(response.ProtocolVersion, response.StatusCode, response.StatusDescription, response.Headers, new byte[0], Encoding), Encoding, AutoDetectCharsetEncoding));
            }

            // Buffer the whole body once; the using blocks release the stream even if the copy throws.
            byte[] bodyBytes;
            using (responseStream)
            using (var body = new MemoryStream())
            {
                responseStream.CopyTo(body);
                bodyBytes = body.ToArray();
            }

            // Captured before the page is built so the text used for meta-refresh detection is decoded
            // with the same encoding the eager decode used, while skipping the decode when it is not needed.
            var pageEncoding = Encoding;

            var rawRequest = new RawRequest(request.Method, request.RequestUri, request.ProtocolVersion, headers, requestBody, Encoding);
            var webPage    = new WebPage(this, url, AutoDownloadPagesResources, rawRequest,
                                         new RawResponse(response.ProtocolVersion, response.StatusCode, response.StatusDescription, response.Headers, bodyBytes, Encoding), Encoding, AutoDetectCharsetEncoding);

            if (!AllowMetaRedirect || string.IsNullOrEmpty(response.ContentType) || !response.ContentType.Contains("html") || iteration >= 10)
            {
                return(webPage);
            }

            var html = pageEncoding.GetString(bodyBytes).ToHtmlNode();
            var meta = html.CssSelect("meta")
                       .FirstOrDefault(p => p.Attributes != null && p.Attributes.HasKeyIgnoreCase("HTTP-EQUIV") &&
                                       p.Attributes.GetIgnoreCase("HTTP-EQUIV").Equals("refresh", StringComparison.InvariantCultureIgnoreCase));

            if (meta == null)
            {
                return(webPage);
            }

            // A refresh tag without a content attribute carries no directive; Regex.Match rejects null input.
            var attr = meta.Attributes.GetIgnoreCase("content");
            if (string.IsNullOrEmpty(attr))
            {
                return(webPage);
            }

            var match = parseMetaRefreshRegex.Match(attr);
            if (!match.Success || !match.Groups["url"].Success)
            {
                return(webPage);
            }

            // Thread.Sleep(TimeSpan) only accepts delays from 0 up to int.MaxValue milliseconds.
            const int maxSleepSeconds = int.MaxValue / 1000;

            var seconds = 0;
            if (match.Groups["seconds"].Success)
            {
                // TryParse avoids an OverflowException on an oversized delay; an unusable delay is
                // treated like any other malformed directive and the page is returned as-is.
                if (!int.TryParse(match.Groups["seconds"].Value, System.Globalization.NumberStyles.Integer,
                                  System.Globalization.CultureInfo.InvariantCulture, out seconds) ||
                    seconds < 0 || seconds > maxSleepSeconds)
                {
                    return(webPage);
                }
            }

            var redirect = Unquote(match.Groups["url"].Value);

            Uri redirectUrl;
            if (!Uri.TryCreate(redirect, UriKind.RelativeOrAbsolute, out redirectUrl))
            {
                return(webPage);
            }

            if (!redirectUrl.IsAbsoluteUri)
            {
                // Resolve against the requested URL with the framework's resolver. Joining Uri.Segments
                // by hand doubles the slashes, because every directory segment already ends with "/".
                Uri resolvedUrl;
                if (!Uri.TryCreate(url, redirectUrl, out resolvedUrl))
                {
                    return(webPage);
                }

                redirectUrl = resolvedUrl;
            }

            // Honour the delay requested by the refresh directive before following it.
            Thread.Sleep(TimeSpan.FromSeconds(seconds));

            return(DownloadRedirect(redirectUrl, iteration + 1));
        }