Esempio n. 1
0
        /// <summary>
        /// Make an http web request to the url and download its content based on the param func decision
        /// </summary>
        /// <param name="uri">Address to request. Must not be null.</param>
        /// <param name="shouldDownloadContent">
        /// Callback invoked with the page once a response has been received; the content is
        /// downloaded only when the returned decision has <c>Allow</c> set.
        /// </param>
        /// <returns>
        /// The <see cref="CrawledPage"/> built for <paramref name="uri"/>. Failures while sending the
        /// request or processing the response are logged (and, for <see cref="HttpRequestException"/>,
        /// stored on the page) instead of being rethrown.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="uri"/> is null.</exception>
        public virtual async Task <CrawledPage> MakeRequestAsync(Uri uri, Func <CrawledPage, CrawlDecision> shouldDownloadContent)
        {
            if (uri == null)
            {
                throw new ArgumentNullException(nameof(uri));
            }

            var crawledPage                         = new CrawledPage(uri);
            HttpRequestMessage  request             = null;
            HttpResponseMessage httpResponseMessage = null;

            try
            {
                request = BuildRequestObject(uri);
                crawledPage.RequestStarted = DateTime.Now;
                httpResponseMessage        = await _httpClient.SendAsync(request);
            }
            catch (HttpRequestException e)
            {
                // Record the transport failure on the page so callers can see why there is no response.
                crawledPage.HttpRequestException = e;
                _logger.LogDebug(e, "Error occurred requesting url [{Url}]", uri.AbsoluteUri);
            }
            catch (Exception e)
            {
                // The exception goes in the dedicated parameter; passing it as a format argument
                // to a template without placeholders would drop it from the log entry.
                _logger.LogDebug(e, "Error occurred requesting url [{Url}]", uri.AbsoluteUri);
            }
            finally
            {
                try
                {
                    crawledPage.HttpRequestMessage = request;
                    crawledPage.RequestCompleted   = DateTime.Now;
                    if (httpResponseMessage != null)
                    {
                        crawledPage.HttpWebResponse = new HttpWebResponseWrapper(httpResponseMessage, _cookieContainer);
                        var shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
                        if (shouldDownloadContentDecision.Allow)
                        {
                            crawledPage.DownloadContentStarted = DateTime.Now;
                            crawledPage.Content = await _extractor.GetContentAsync(httpResponseMessage);

                            crawledPage.DownloadContentCompleted = DateTime.Now;
                        }
                        else
                        {
                            _logger.LogDebug("Links on page [{Url}] not crawled, [{Reason}]", crawledPage.Uri.AbsoluteUri, shouldDownloadContentDecision.Reason);
                        }

                        // Throws HttpRequestException for a non-success status; handled below.
                        httpResponseMessage.EnsureSuccessStatusCode();
                    }
                }
                catch (HttpRequestException e)
                {
                    crawledPage.HttpRequestException = e;
                    _logger.LogDebug(e, "Error occurred finalizing requesting url [{Url}]", uri.AbsoluteUri);
                }
                catch (Exception e)
                {
                    _logger.LogDebug(e, "Error occurred finalizing requesting url [{Url}]", uri.AbsoluteUri);
                }
                finally
                {
                    // Dispose on every path, including a non-success status or a failed download.
                    httpResponseMessage?.Dispose();
                }
            }

            return(crawledPage);
        }
Esempio n. 2
0
        /// <summary>
        /// Synchronously requests <paramref name="uri"/> and downloads the response content when
        /// <paramref name="shouldDownloadContent"/> allows it.
        /// </summary>
        /// <param name="uri">Address to request. Must not be null.</param>
        /// <param name="shouldDownloadContent">
        /// Callback invoked with the page once a response has been received; the content is
        /// downloaded only when the returned decision has <c>Allow</c> set.
        /// </param>
        /// <returns>
        /// The <see cref="CrawledPage"/> built for <paramref name="uri"/>. Failures are passed to
        /// <c>Error</c> together with the page instead of being rethrown.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="uri"/> is null.</exception>
        public virtual CrawledPage MakeRequest(Uri uri, Func <CrawledPage, CrawlDecision> shouldDownloadContent)
        {
            if (uri == null)
            {
                throw new ArgumentNullException(nameof(uri));
            }

            CrawledPage         crawledPage = new CrawledPage(uri);
            HttpResponseMessage response    = null;

            try
            {
                crawledPage.RequestStarted = DateTime.Now;
                using (HttpRequestMessage request = BuildRequestObject(uri))
                {
                    response = _client.SendAsync(request).Result;
                }
            }
            catch (Exception e)
            {
                // A single handler is enough: the former TaskCanceledException handler did exactly
                // the same thing, and Task.Result surfaces failures wrapped in AggregateException.
                Error(crawledPage, e);
            }

            try
            {
                crawledPage.RequestCompleted = DateTime.Now;
                if (response != null)
                {
                    // Read the status only when a response exists; a failed request leaves it null.
                    crawledPage.StatusCode = response.StatusCode;

                    CrawlDecision shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
                    if (shouldDownloadContentDecision.Allow)
                    {
                        crawledPage.DownloadContentStarted = DateTime.Now;
                        crawledPage.Content = _webContentExtractor.GetContentAsync(response).Result;
                        crawledPage.DownloadContentCompleted = DateTime.Now;
                    }
                    else
                    {
                        _logger.LogWarning("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldDownloadContentDecision.Reason);
                    }
                }
            }
            catch (Exception e)
            {
                Error(crawledPage, e);
            }
            finally
            {
                // Release the response even when the callback or the content download throws.
                response?.Dispose();
            }

            return(crawledPage);
        }
Esempio n. 3
0
        /// <summary>
        /// Requests the given url and, when the supplied decision callback allows it, downloads the
        /// content of the response.
        /// </summary>
        /// <param name="uri">Address to request. Must not be null.</param>
        /// <param name="shouldDownloadContent">
        /// Callback invoked with the page once a response has been received; content is downloaded
        /// only when the returned decision has <c>Allow</c> set.
        /// </param>
        /// <returns>
        /// The <see cref="CrawledPage"/> for <paramref name="uri"/>. Request failures, timeouts and
        /// status codes outside 200-399 are stored in its <c>HttpRequestException</c> property.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="uri"/> is null.</exception>
        public virtual async Task <CrawledPage> MakeRequestAsync(Uri uri, Func <CrawledPage, CrawlDecision> shouldDownloadContent)
        {
            if (uri == null)
            {
                throw new ArgumentNullException(nameof(uri));
            }

            // The client and its handler are built on first use.
            if (_httpClient == null)
            {
                _httpClientHandler = BuildHttpClientHandler(uri);
                _httpClient        = BuildHttpClient(_httpClientHandler);
            }

            var page = new CrawledPage(uri);
            HttpResponseMessage httpResponse = null;

            try
            {
                page.RequestStarted = DateTime.Now;
                using (var outgoing = BuildHttpRequestMessage(uri))
                {
                    httpResponse = await _httpClient.SendAsync(outgoing, CancellationToken.None).ConfigureAwait(false);
                }

                // Anything outside 200-399 is reported through the HttpRequestException handler below.
                var statusCode   = (int)httpResponse.StatusCode;
                var isAcceptable = statusCode >= 200 && statusCode <= 399;
                if (!isAcceptable)
                {
                    throw new HttpRequestException($"Server response was unsuccessful, returned [http {statusCode}]");
                }
            }
            catch (HttpRequestException requestFailure)
            {
                page.HttpRequestException = requestFailure;
                Log.Debug("Error occurred requesting url [{0}] {@Exception}", uri.AbsoluteUri, requestFailure);
            }
            catch (TaskCanceledException canceled)
            {
                // HttpClient reports a timeout as a cancelled task:
                // https://stackoverflow.com/questions/10547895/how-can-i-tell-when-httpclient-has-timed-out
                page.HttpRequestException = new HttpRequestException("Request timeout occurred", canceled);
                Log.Debug("Error occurred requesting url [{0}] {@Exception}", uri.AbsoluteUri, page.HttpRequestException);
            }
            catch (Exception unexpected)
            {
                page.HttpRequestException = new HttpRequestException("Unknown error occurred", unexpected);
                Log.Debug("Error occurred requesting url [{0}] {@Exception}", uri.AbsoluteUri, page.HttpRequestException);
            }
            finally
            {
                page.HttpRequestMessage  = httpResponse?.RequestMessage;
                page.RequestCompleted    = DateTime.Now;
                page.HttpResponseMessage = httpResponse;
                page.HttpClientHandler   = _httpClientHandler;

                if (httpResponse != null)
                {
                    try
                    {
                        var decision = shouldDownloadContent(page);
                        if (!decision.Allow)
                        {
                            Log.Debug("Links on page [{0}] not crawled, [{1}]", page.Uri.AbsoluteUri, decision.Reason);
                        }
                        else
                        {
                            page.DownloadContentStarted = DateTime.Now;
                            page.Content = await _contentExtractor.GetContentAsync(httpResponse).ConfigureAwait(false);

                            if (_config.IsFollowMetaRedirectsEnabled)
                            {
                                page.MetaRedirectURL = _contentExtractor.GetMetaRedirectUrl(page);
                            }

                            page.DownloadContentCompleted = DateTime.Now;
                        }
                    }
                    catch (Exception finalizeFailure)
                    {
                        Log.Debug("Error occurred finalizing requesting url [{0}] {@Exception}", uri.AbsoluteUri, finalizeFailure);
                    }
                }
            }

            return page;
        }
Esempio n. 4
0
        /// <summary>
        /// Make an http web request to the url and download its content based on the param func decision
        /// </summary>
        /// <param name="uri">Address to request. Must not be null.</param>
        /// <param name="shouldDownloadContent">
        /// Callback invoked with the page once a response has been received; the content is
        /// downloaded only when the returned decision has <c>Allow</c> set.
        /// </param>
        /// <returns>
        /// The <see cref="CrawledPage"/> for <paramref name="uri"/>. When sending the request fails,
        /// the failure is stored in its <c>HttpRequestException</c> property instead of being rethrown.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="uri"/> is null.</exception>
        public virtual async Task <CrawledPage> MakeRequestAsync(Uri uri, Func <CrawledPage, CrawlDecision> shouldDownloadContent)
        {
            if (uri == null)
            {
                throw new ArgumentNullException(nameof(uri));
            }

            // The client and its handler are built on first use.
            if (_httpClient == null)
            {
                _httpClientHandler = BuildHttpClientHandler(uri);
                _httpClient        = BuildHttpClient(_httpClientHandler);
            }

            var crawledPage = new CrawledPage(uri);
            HttpResponseMessage response = null;

            try
            {
                crawledPage.RequestStarted = DateTime.Now;
                using (var requestMessage = BuildHttpRequestMessage(uri))
                {
                    response = await _httpClient.SendAsync(requestMessage, CancellationToken.None).ConfigureAwait(false);
                }
            }
            catch (HttpRequestException hre)
            {
                crawledPage.HttpRequestException = hre;

                Log.Logger.Debug("Error occurred requesting url [{0}] {@Exception}", uri.AbsoluteUri, hre);
            }
            catch (TaskCanceledException ex)
            {
                // HttpClient reports a timeout as a cancelled task. Record it on the page so the
                // caller does not receive a page with neither a response nor an error.
                crawledPage.HttpRequestException = new HttpRequestException("Request timeout occurred", ex);

                Log.Logger.Debug("Error occurred requesting url [{0}] {@Exception}", uri.AbsoluteUri, crawledPage.HttpRequestException);
            }
            catch (Exception e)
            {
                // Any other failure is wrapped so that it is also visible on the returned page.
                crawledPage.HttpRequestException = new HttpRequestException("Unknown error occurred", e);

                Log.Logger.Debug("Error occurred requesting url [{0}] {@Exception}", uri.AbsoluteUri, crawledPage.HttpRequestException);
            }
            finally
            {
                crawledPage.HttpRequestMessage  = response?.RequestMessage;
                crawledPage.RequestCompleted    = DateTime.Now;
                crawledPage.HttpResponseMessage = response;

                try
                {
                    if (response != null)
                    {
                        var shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
                        if (shouldDownloadContentDecision.Allow)
                        {
                            crawledPage.DownloadContentStarted = DateTime.Now;
                            crawledPage.Content = await _contentExtractor.GetContentAsync(response).ConfigureAwait(false);

                            crawledPage.DownloadContentCompleted = DateTime.Now;
                        }
                        else
                        {
                            Log.Logger.Debug("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldDownloadContentDecision.Reason);
                        }
                    }
                }
                catch (Exception e)
                {
                    Log.Logger.Debug("Error occurred finalizing requesting url [{0}] {@Exception}", uri.AbsoluteUri, e);
                }
            }

            return(crawledPage);
        }