/// <summary>
/// Make an http web request to the url and download its content based on the param func decision.
/// </summary>
/// <param name="uri">The url to request. Must not be null.</param>
/// <param name="shouldDownloadContent">Decision callback invoked after the response arrives; content is only downloaded when it returns Allow.</param>
/// <returns>A CrawledPage with timing info, the response wrapper, and (optionally) the downloaded content. Request failures are recorded on the page, not thrown.</returns>
public virtual async Task<CrawledPage> MakeRequestAsync(Uri uri, Func<CrawledPage, CrawlDecision> shouldDownloadContent)
{
    if (uri == null)
    {
        throw new ArgumentNullException(nameof(uri));
    }

    var crawledPage = new CrawledPage(uri);
    HttpRequestMessage request = null;
    HttpResponseMessage httpResponseMessage = null;
    try
    {
        request = BuildRequestObject(uri);
        crawledPage.RequestStarted = DateTime.Now;
        httpResponseMessage = await _httpClient.SendAsync(request);
    }
    catch (Exception e)
    {
        // BUGFIX: the exception must be the FIRST argument to LogDebug; previously it was
        // passed as a message format argument, so the exception details were never logged.
        _logger.LogDebug(e, "Error occurred requesting url [{Url}]", uri.AbsoluteUri);
    }
    finally
    {
        try
        {
            crawledPage.HttpRequestMessage = request;
            crawledPage.RequestCompleted = DateTime.Now;
            if (httpResponseMessage != null)
            {
                try
                {
                    crawledPage.HttpWebResponse = new HttpWebResponseWrapper(httpResponseMessage, _cookieContainer);
                    var shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
                    if (shouldDownloadContentDecision.Allow)
                    {
                        crawledPage.DownloadContentStarted = DateTime.Now;
                        crawledPage.Content = await _extractor.GetContentAsync(httpResponseMessage);
                        crawledPage.DownloadContentCompleted = DateTime.Now;
                    }
                    else
                    {
                        _logger.LogDebug("Links on page [{Url}] not crawled, [{Reason}]", crawledPage.Uri.AbsoluteUri, shouldDownloadContentDecision.Reason);
                    }

                    // Throws for non-success status codes; caught below to record the failure on the page.
                    // Content (if allowed) is deliberately downloaded first so error pages are still captured.
                    httpResponseMessage.EnsureSuccessStatusCode();
                }
                finally
                {
                    // BUGFIX: previously Dispose was skipped when EnsureSuccessStatusCode threw,
                    // leaking the response. Should already be closed by _extractor but just being safe.
                    httpResponseMessage.Dispose();
                }
            }
        }
        catch (HttpRequestException e)
        {
            crawledPage.HttpRequestException = e;
            _logger.LogDebug(e, "Error occurred finalizing requesting url [{Url}]", uri.AbsoluteUri);
        }
        catch (Exception e)
        {
            _logger.LogDebug(e, "Error occurred finalizing requesting url [{Url}]", uri.AbsoluteUri);
        }
    }

    return crawledPage;
}
/// <summary>
/// Make an http web request to the url and download its content based on the param func decision.
/// Synchronous wrapper around the async http pipeline.
/// </summary>
/// <param name="uri">The url to request. Must not be null.</param>
/// <param name="shouldDownloadContent">Decision callback; content is downloaded only when it returns Allow.</param>
/// <returns>A CrawledPage; failures are reported through Error(...) rather than thrown.</returns>
public virtual CrawledPage MakeRequest(Uri uri, Func<CrawledPage, CrawlDecision> shouldDownloadContent)
{
    if (uri == null)
    {
        throw new ArgumentNullException(nameof(uri));
    }

    CrawledPage crawledPage = new CrawledPage(uri);
    HttpResponseMessage response = null;
    try
    {
        crawledPage.RequestStarted = DateTime.Now;
        HttpRequestMessage request = BuildRequestObject(uri);
        // BUGFIX: .Result wraps faults in AggregateException, which made the
        // catch (TaskCanceledException) clause below unreachable dead code.
        // GetAwaiter().GetResult() rethrows the original exception type.
        response = _client.SendAsync(request).GetAwaiter().GetResult();
    }
    catch (TaskCanceledException e)
    {
        Error(crawledPage, e);
    }
    catch (Exception e)
    {
        Error(crawledPage, e);
    }

    try
    {
        crawledPage.RequestCompleted = DateTime.Now;
        if (response != null)
        {
            // BUGFIX: StatusCode was previously read BEFORE the null check, throwing
            // NullReferenceException on every failed request.
            crawledPage.StatusCode = response.StatusCode;
            CrawlDecision shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
            if (shouldDownloadContentDecision.Allow)
            {
                crawledPage.DownloadContentStarted = DateTime.Now;
                crawledPage.Content = _webContentExtractor.GetContentAsync(response).GetAwaiter().GetResult();
                crawledPage.DownloadContentCompleted = DateTime.Now;
            }
            else
            {
                _logger.LogWarning("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldDownloadContentDecision.Reason);
            }
            response.Dispose();
        }
    }
    catch (Exception e)
    {
        Error(crawledPage, e);
    }

    return crawledPage;
}
/// <summary>
/// Issues an http request for the given url and, when the supplied decision func allows it,
/// downloads the response content into the returned <see cref="CrawledPage"/>.
/// Request failures are recorded on the page (HttpRequestException property) rather than thrown.
/// </summary>
/// <param name="uri">The url to request. Must not be null.</param>
/// <param name="shouldDownloadContent">Invoked after the response arrives to decide whether to download the body.</param>
/// <returns>The populated CrawledPage, including request/response timing and the handler used.</returns>
public virtual async Task<CrawledPage> MakeRequestAsync(Uri uri, Func<CrawledPage, CrawlDecision> shouldDownloadContent)
{
    if (uri == null)
        throw new ArgumentNullException(nameof(uri));

    // Lazily build the client/handler pair on first use.
    if (_httpClient == null)
    {
        _httpClientHandler = BuildHttpClientHandler(uri);
        _httpClient = BuildHttpClient(_httpClientHandler);
    }

    var crawledPage = new CrawledPage(uri);
    HttpResponseMessage response = null;
    try
    {
        crawledPage.RequestStarted = DateTime.Now;

        using (var requestMessage = BuildHttpRequestMessage(uri))
        {
            response = await _httpClient.SendAsync(requestMessage, CancellationToken.None).ConfigureAwait(false);
        }

        // Anything outside 2xx/3xx counts as a failed request.
        var statusCode = Convert.ToInt32(response.StatusCode);
        if (statusCode < 200 || statusCode > 399)
            throw new HttpRequestException($"Server response was unsuccessful, returned [http {statusCode}]");
    }
    catch (HttpRequestException hre)
    {
        crawledPage.HttpRequestException = hre;
        Log.Debug("Error occurred requesting url [{0}] {@Exception}", uri.AbsoluteUri, hre);
    }
    catch (TaskCanceledException ex)
    {
        // HttpClient surfaces its timeout as TaskCanceledException:
        // https://stackoverflow.com/questions/10547895/how-can-i-tell-when-httpclient-has-timed-out
        crawledPage.HttpRequestException = new HttpRequestException("Request timeout occurred", ex);
        Log.Debug("Error occurred requesting url [{0}] {@Exception}", uri.AbsoluteUri, crawledPage.HttpRequestException);
    }
    catch (Exception e)
    {
        crawledPage.HttpRequestException = new HttpRequestException("Unknown error occurred", e);
        Log.Debug("Error occurred requesting url [{0}] {@Exception}", uri.AbsoluteUri, crawledPage.HttpRequestException);
    }
    finally
    {
        // Always record what we have, even on failure.
        crawledPage.HttpRequestMessage = response?.RequestMessage;
        crawledPage.RequestCompleted = DateTime.Now;
        crawledPage.HttpResponseMessage = response;
        crawledPage.HttpClientHandler = _httpClientHandler;

        try
        {
            if (response != null)
            {
                var downloadDecision = shouldDownloadContent(crawledPage);
                if (downloadDecision.Allow)
                {
                    crawledPage.DownloadContentStarted = DateTime.Now;
                    crawledPage.Content = await _contentExtractor.GetContentAsync(response).ConfigureAwait(false);
                    if (_config.IsFollowMetaRedirectsEnabled)
                        crawledPage.MetaRedirectURL = _contentExtractor.GetMetaRedirectUrl(crawledPage);
                    crawledPage.DownloadContentCompleted = DateTime.Now;
                }
                else
                {
                    Log.Debug("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri, downloadDecision.Reason);
                }
            }
        }
        catch (Exception e)
        {
            Log.Debug("Error occurred finalizing requesting url [{0}] {@Exception}", uri.AbsoluteUri, e);
        }
    }

    return crawledPage;
}
/// <summary>
/// Make an http web request to the url and download its content based on the param func decision.
/// Failures are recorded on the returned page's HttpRequestException property, never thrown.
/// </summary>
/// <param name="uri">The url to request. Must not be null.</param>
/// <param name="shouldDownloadContent">Decision callback; content is downloaded only when it returns Allow.</param>
/// <returns>A CrawledPage with timing info, the raw response message, and optionally the downloaded content.</returns>
public virtual async Task<CrawledPage> MakeRequestAsync(Uri uri, Func<CrawledPage, CrawlDecision> shouldDownloadContent)
{
    if (uri == null)
    {
        throw new ArgumentNullException(nameof(uri));
    }

    // Lazily build the client/handler pair on first use.
    if (_httpClient == null)
    {
        _httpClientHandler = BuildHttpClientHandler(uri);
        _httpClient = BuildHttpClient(_httpClientHandler);
    }

    var crawledPage = new CrawledPage(uri);
    HttpResponseMessage response = null;
    try
    {
        crawledPage.RequestStarted = DateTime.Now;
        using (var requestMessage = BuildHttpRequestMessage(uri))
        {
            response = await _httpClient.SendAsync(requestMessage, CancellationToken.None).ConfigureAwait(false);
        }
    }
    catch (HttpRequestException hre)
    {
        crawledPage.HttpRequestException = hre;
        Log.Logger.Debug("Error occurred requesting url [{0}] {@Exception}", uri.AbsoluteUri, hre);
    }
    catch (TaskCanceledException ex)
    {
        // HttpClient reports its timeout as TaskCanceledException; surface it to callers
        // as an HttpRequestException, consistent with the other request variants.
        crawledPage.HttpRequestException = new HttpRequestException("Request timeout occurred", ex);
        Log.Logger.Debug("Error occurred requesting url [{0}] {@Exception}", uri.AbsoluteUri, crawledPage.HttpRequestException);
    }
    catch (Exception e)
    {
        // BUGFIX: previously the failure was only logged, leaving crawledPage.HttpRequestException
        // null so callers could not detect that the request failed.
        crawledPage.HttpRequestException = new HttpRequestException("Unknown error occurred", e);
        Log.Logger.Debug("Error occurred requesting url [{0}] {@Exception}", uri.AbsoluteUri, crawledPage.HttpRequestException);
    }
    finally
    {
        crawledPage.HttpRequestMessage = response?.RequestMessage;
        crawledPage.RequestCompleted = DateTime.Now;
        crawledPage.HttpResponseMessage = response;

        try
        {
            if (response != null)
            {
                var shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
                if (shouldDownloadContentDecision.Allow)
                {
                    crawledPage.DownloadContentStarted = DateTime.Now;
                    crawledPage.Content = await _contentExtractor.GetContentAsync(response).ConfigureAwait(false);
                    crawledPage.DownloadContentCompleted = DateTime.Now;
                }
                else
                {
                    Log.Logger.Debug("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldDownloadContentDecision.Reason);
                }
            }
        }
        catch (Exception e)
        {
            Log.Logger.Debug("Error occurred finalizing requesting url [{0}] {@Exception}", uri.AbsoluteUri, e);
        }
    }

    return crawledPage;
}