Example no. 1
        /// <summary>
        /// Make an http web request to the url and download its content based on the param func decision
        /// </summary>
        /// <param name="uri">Absolute url to request; must not be null.</param>
        /// <param name="shouldDownloadContent">Callback that decides whether the response body is downloaded; must not be null.</param>
        /// <returns>A CrawledPage describing the request/response; returned even when the request failed (see WebException property).</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="uri"/> or <paramref name="shouldDownloadContent"/> is null.</exception>
        public virtual CrawledPage MakeRequest(Uri uri, Func <CrawledPage, CrawlDecision> shouldDownloadContent)
        {
            if (uri == null)
            {
                throw new ArgumentNullException(nameof(uri));
            }

            if (shouldDownloadContent == null)
            {
                // Guard here: a null delegate would otherwise throw inside the finally
                // block below and mask the real outcome of the request.
                throw new ArgumentNullException(nameof(shouldDownloadContent));
            }

            CrawledPage crawledPage = new CrawledPage(uri);

            HttpWebRequest  request  = null;
            HttpWebResponse response = null;

            try
            {
                request  = BuildRequestObject(uri);
                response = (HttpWebResponse)request.GetResponse();
                ProcessResponseObject(response);
            }
            catch (WebException e)
            {
                crawledPage.WebException = e;

                // A WebException can still carry a usable response (e.g. a 404 with a body)
                if (e.Response != null)
                {
                    response = (HttpWebResponse)e.Response;
                }

                _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
                _logger.Debug(e);
            }
            catch (Exception e)
            {
                _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
                _logger.Debug(e);
            }
            finally
            {
                // Protect the finalization step so an exception thrown by the decision
                // callback or the extractor cannot escape the finally block (which would
                // replace the original exception). Mirrors the async overload's behavior.
                try
                {
                    crawledPage.HttpWebRequest = request;

                    if (response != null)
                    {
                        crawledPage.HttpWebResponse = response;
                        CrawlDecision shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
                        if (shouldDownloadContentDecision.Allow)
                        {
                            crawledPage.Content = _extractor.GetContent(response);
                        }
                        else
                        {
                            _logger.DebugFormat("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldDownloadContentDecision.Reason);
                        }

                        response.Close();//Should already be closed by _extractor but just being safe
                    }
                }
                catch (Exception e)
                {
                    _logger.DebugFormat("Error occurred finalizing requesting url [{0}]", uri.AbsoluteUri);
                    _logger.Debug(e);
                }
            }

            return(crawledPage);
        }
Example no. 2
        /// <summary>
        /// Make an http web request to the url and download its content based on the param func decision
        /// </summary>
        /// <param name="uri">Absolute url to request; must not be null.</param>
        /// <param name="shouldDownloadContent">Callback that decides whether the response body is downloaded; must not be null.</param>
        /// <returns>A CrawledPage describing the request/response; returned even when the request failed (see WebException property).</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="uri"/> or <paramref name="shouldDownloadContent"/> is null.</exception>
        public virtual async Task <CrawledPage> MakeRequestAsync(Uri uri, Func <CrawledPage, CrawlDecision> shouldDownloadContent)
        {
            if (uri == null)
            {
                throw new ArgumentNullException(nameof(uri));
            }

            if (shouldDownloadContent == null)
            {
                // Guard here: without it a null delegate throws inside the finally block
                // below and is silently swallowed by its catch, hiding the caller's bug.
                throw new ArgumentNullException(nameof(shouldDownloadContent));
            }

            // Create page for crawling
            CrawledPage crawledPage = new CrawledPage(uri);

            // Sending request and getting response
            HttpWebRequest  request  = null;
            HttpWebResponse response = null;

            try
            {
                request = BuildRequestObject(uri);

                crawledPage.RequestStarted = DateTime.Now;
                // Library code: no need to resume on the caller's synchronization context
                response = await request.GetResponseAsync().ConfigureAwait(false) as HttpWebResponse;

                ProcessResponseObject(response);
            }
            catch (WebException e)
            {
                crawledPage.WebException = e;

                // A WebException can still carry a usable response (e.g. a 404 with a body)
                if (e.Response != null)
                {
                    response = (HttpWebResponse)e.Response;
                }

                Logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
                Logger.Debug(e);
            }
            catch (Exception e)
            {
                Logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
                Logger.Debug(e);
            }
            finally
            {
                // Protected so a failure while finalizing (decision callback, extractor)
                // cannot escape the finally block and mask the original outcome.
                try
                {
                    crawledPage.HttpWebRequest   = request;
                    crawledPage.RequestCompleted = DateTime.Now;

                    if (response != null)
                    {
                        crawledPage.HttpWebResponse = new HttpWebResponseWrapper(response);

                        CrawlDecision shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
                        if (shouldDownloadContentDecision.Allow)
                        {
                            crawledPage.DownloadContentStarted = DateTime.Now;
                            // Collect useful info from page
                            crawledPage.Content = Extractor.GetContent(response);
                            crawledPage.DownloadContentCompleted = DateTime.Now;
                        }
                        else
                        {
                            Logger.DebugFormat("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldDownloadContentDecision.Reason);
                        }

                        // Should already be closed by _extractor but just being safe
                        response.Close();
                    }
                }
                catch (Exception e)
                {
                    Logger.DebugFormat("Error occurred finalizing requesting url [{0}]", uri.AbsoluteUri);
                    Logger.Debug(e);
                }
            }

            return(crawledPage);
        }
Example no. 3
        /// <summary>
        /// Make an http web request to the url and download its content based on the param func decision
        /// </summary>
        /// <param name="uri">Absolute url to request; must not be null.</param>
        /// <param name="shouldDownloadContent">Callback that decides whether the response body is downloaded; must not be null.</param>
        /// <returns>A CrawledPage describing the request/response; returned even when the request failed (see WebException property).</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="uri"/> or <paramref name="shouldDownloadContent"/> is null.</exception>
        public virtual async Task <CrawledPage> MakeRequestAsync(Uri uri, Func <CrawledPage, CrawlDecision> shouldDownloadContent)
        {
            if (uri == null)
            {
                throw new ArgumentNullException(nameof(uri));
            }

            if (shouldDownloadContent == null)
            {
                // Guard here: without it a null delegate throws inside the finally block
                // below and is silently swallowed by its catch, hiding the caller's bug.
                throw new ArgumentNullException(nameof(shouldDownloadContent));
            }

            var crawledPage = new CrawledPage(uri);
            HttpResponseMessage httpResponseMessage = null;

            try
            {
                crawledPage.RequestStarted = DateTime.Now;
                // Library code: no need to resume on the caller's synchronization context
                httpResponseMessage        = await _httpClient.GetAsync(uri).ConfigureAwait(false);
            }
            catch (WebException e)
            {
                crawledPage.WebException = e;
                // Use the exception-first overload so the exception (and its stack trace)
                // is actually recorded; passing it as a message argument drops it.
                _logger.LogDebug(e, "Error occurred requesting url [{Uri}]", uri.AbsoluteUri);
            }
            catch (Exception e)
            {
                _logger.LogDebug(e, "Error occurred requesting url [{Uri}]", uri.AbsoluteUri);
            }
            finally
            {
                // Protected so a failure while finalizing (decision callback, extractor)
                // cannot escape the finally block and mask the original outcome.
                try
                {
                    crawledPage.HttpResponseMessage = httpResponseMessage;
                    crawledPage.RequestCompleted    = DateTime.Now;
                    if (httpResponseMessage != null)
                    {
                        crawledPage.HttpWebResponse = new HttpWebResponseWrapper(httpResponseMessage);
                        var shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
                        if (shouldDownloadContentDecision.Allow)
                        {
                            crawledPage.DownloadContentStarted = DateTime.Now;
                            crawledPage.Content = await _extractor.GetContent(httpResponseMessage).ConfigureAwait(false);

                            crawledPage.DownloadContentCompleted = DateTime.Now;
                        }
                        else
                        {
                            _logger.LogDebug("Links on page [{Uri}] not crawled, [{Reason}]", crawledPage.Uri.AbsoluteUri, shouldDownloadContentDecision.Reason);
                        }

                        // HttpResponseMessage content is expected to be consumed/disposed by _extractor
                    }
                }
                catch (Exception e)
                {
                    _logger.LogDebug(e, "Error occurred finalizing requesting url [{Uri}]", uri.AbsoluteUri);
                }
            }

            return(crawledPage);
        }
Example no. 4
        /// <summary>
        /// Make an http web request to the url and download its content based on the param func decision.
        /// On a 3xx WebException it attempts one retry against the redirect target before giving up.
        /// </summary>
        /// <param name="uri">Absolute url to request; must not be null.</param>
        /// <param name="shouldDownloadContent">Callback that decides whether the response body is downloaded; must not be null.</param>
        /// <returns>A CrawledPage describing the request/response; returned even when the request failed (see WebException property).</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="uri"/> or <paramref name="shouldDownloadContent"/> is null.</exception>
        public virtual CrawledPage MakeRequest(Uri uri, Func <CrawledPage, CrawlDecision> shouldDownloadContent)
        {
            if (uri == null)
            {
                throw new ArgumentNullException(nameof(uri));
            }

            if (shouldDownloadContent == null)
            {
                // Guard here: without it a null delegate throws inside the finally block
                // below and is silently swallowed by its catch, hiding the caller's bug.
                throw new ArgumentNullException(nameof(shouldDownloadContent));
            }

            CrawledPage crawledPage = new CrawledPage(uri);

            HttpWebRequest  request  = null;
            HttpWebResponse response = null;

            try
            {
                request = BuildRequestObject(uri);
                crawledPage.RequestStarted = DateTime.Now;
                response = (HttpWebResponse)request.GetResponse();
                ProcessResponseObject(response);
            }
            catch (WebException e)
            {
                // Try to patch redirect error: extract a candidate target uri from the
                // failed response and retry the request once against it.
                bool fixedRedirectError = false;
                if (e.Response != null)
                {
                    string candidateUri = null;
                    Uri    fixedUri     = null;

                    int statusCode = e.Response is HttpWebResponse ?
                                     (int)((HttpWebResponse)e.Response).StatusCode : 0;
                    if (statusCode >= 300 && statusCode <= 308)
                    {
                        // Prefer the Location header as the redirect target
                        string location = e.Response.Headers != null?
                                          e.Response.Headers.Get("Location") : null;

                        if (location != null)
                        {
                            candidateUri = location;
                        }
                    }
                    if (candidateUri == null && e.Response.ResponseUri != null)
                    {
                        candidateUri = e.Response.ResponseUri.AbsoluteUri;
                    }
                    if (candidateUri != null)
                    {
                        if (candidateUri.Contains("#"))
                        {
                            // Strip the fragment: it is client-side only and can break the retry
                            candidateUri = candidateUri.Substring(0, candidateUri.IndexOf('#'));
                        }
                        else if (candidateUri.EndsWith("%20") && e.Response.ResponseUri != null)
                        {
                            // Null-checked: candidateUri may come from the Location header
                            // while ResponseUri is null, which previously threw NRE here
                            candidateUri = e.Response.ResponseUri.ToString().Trim();
                        }

                        if (Uri.IsWellFormedUriString(candidateUri, UriKind.Absolute))
                        {
                            fixedUri = new Uri(candidateUri);
                        }
                        else if (e.Response.ResponseUri != null && Uri.IsWellFormedUriString(candidateUri, UriKind.Relative))
                        {
                            // Relative redirect: resolve against the uri that actually responded
                            fixedUri = new Uri(e.Response.ResponseUri, candidateUri);
                        }
                    }
                    if (fixedUri != null)
                    {
                        try
                        {
                            request = BuildRequestObject(fixedUri);
                            crawledPage.RequestStarted = DateTime.Now;
                            response = (HttpWebResponse)request.GetResponse();
                            ProcessResponseObject(response);
                            fixedRedirectError = true;
                        }
                        catch { /* Failed again => stay on the first error */ }
                    }
                }

                if (!fixedRedirectError)
                {
                    crawledPage.WebException = e;

                    if (e.Response != null)
                    {
                        response = (HttpWebResponse)e.Response;

                        // Check for protections against bot scraping
                        if (response.Headers.Get("X-DataDome") != null)
                        {
                            // Log through the crawler's logger rather than the console so the
                            // message lands in the same sink as every other diagnostic
                            _logger.DebugFormat("This website is protected against bots crawling by https://datadome.co/fr/ : crawl aborted to comply with this policy.");
                        }
                    }

                    _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
                    _logger.Debug(e);
                }
            }
            catch (Exception e)
            {
                _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
                _logger.Debug(e);
            }
            finally
            {
                // Protected so a failure while finalizing (decision callback, extractor)
                // cannot escape the finally block and mask the original outcome.
                try
                {
                    crawledPage.HttpWebRequest   = request;
                    crawledPage.RequestCompleted = DateTime.Now;
                    if (response != null)
                    {
                        crawledPage.HttpWebResponse = new HttpWebResponseWrapper(response);
                        CrawlDecision shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
                        if (shouldDownloadContentDecision.Allow)
                        {
                            crawledPage.DownloadContentStarted = DateTime.Now;
                            crawledPage.Content = _extractor.GetContent(response);
                            crawledPage.DownloadContentCompleted = DateTime.Now;
                        }
                        else
                        {
                            _logger.DebugFormat("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldDownloadContentDecision.Reason);
                        }

                        response.Close();//Should already be closed by _extractor but just being safe
                    }
                }
                catch (Exception e)
                {
                    _logger.DebugFormat("Error occurred finalizing requesting url [{0}]", uri.AbsoluteUri);
                    _logger.Debug(e);
                }
            }

            return(crawledPage);
        }