/// <summary>
/// Makes an HTTP web request to the url and downloads its content based on the decision
/// returned by the shouldDownloadContent delegate.
/// </summary>
public virtual CrawledPage MakeRequest(Uri uri, Func<CrawledPage, CrawlDecision> shouldDownloadContent)
{
    if (uri == null)
        throw new ArgumentNullException(nameof(uri));

    CrawledPage crawledPage = new CrawledPage(uri);

    HttpWebRequest request = null;
    HttpWebResponse response = null;
    try
    {
        request = BuildRequestObject(uri);
        response = (HttpWebResponse)request.GetResponse();
        ProcessResponseObject(response);
    }
    catch (WebException e)
    {
        crawledPage.WebException = e;

        if (e.Response != null)
            response = (HttpWebResponse)e.Response;

        _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
        _logger.Debug(e);
    }
    catch (Exception e)
    {
        _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
        _logger.Debug(e);
    }
    finally
    {
        crawledPage.HttpWebRequest = request;

        if (response != null)
        {
            crawledPage.HttpWebResponse = response;
            CrawlDecision shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
            if (shouldDownloadContentDecision.Allow)
            {
                crawledPage.Content = _extractor.GetContent(response);
            }
            else
            {
                _logger.DebugFormat("Links on page [{0}] not crawled, [{1}]",
                    crawledPage.Uri.AbsoluteUri, shouldDownloadContentDecision.Reason);
            }

            // Should already be closed by _extractor, but just being safe
            response.Close();
        }
    }

    return crawledPage;
}
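// Hedged usage sketch, not part of the class above: one way a caller might gate
// content downloads by content type. The class name "PageRequester" and the helper
// name below are assumptions for illustration; CrawlDecision.Allow/Reason follow
// the usage in the method above.
public static CrawledPage FetchHtmlOnly(PageRequester requester, Uri uri)
{
    return requester.MakeRequest(uri, crawledPage =>
    {
        // Only download the body when the server reports an HTML content type
        string contentType = crawledPage.HttpWebResponse != null
            ? crawledPage.HttpWebResponse.ContentType
            : null;
        bool isHtml = contentType != null && contentType.Contains("text/html");

        return new CrawlDecision
        {
            Allow = isHtml,
            Reason = isHtml ? "" : "Content type is not text/html"
        };
    });
}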
/// <summary>
/// Makes an HTTP web request to the url and downloads its content based on the decision
/// returned by the shouldDownloadContent delegate.
/// </summary>
public virtual async Task<CrawledPage> MakeRequestAsync(Uri uri, Func<CrawledPage, CrawlDecision> shouldDownloadContent)
{
    if (uri == null)
        throw new ArgumentNullException(nameof(uri));

    // Create page for crawling
    CrawledPage crawledPage = new CrawledPage(uri);

    // Send the request and get the response
    HttpWebRequest request = null;
    HttpWebResponse response = null;
    try
    {
        request = BuildRequestObject(uri);
        crawledPage.RequestStarted = DateTime.Now;
        response = await request.GetResponseAsync() as HttpWebResponse;
        ProcessResponseObject(response);
    }
    catch (WebException e)
    {
        crawledPage.WebException = e;

        if (e.Response != null)
            response = (HttpWebResponse)e.Response;

        Logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
        Logger.Debug(e);
    }
    catch (Exception e)
    {
        Logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
        Logger.Debug(e);
    }
    finally
    {
        try
        {
            crawledPage.HttpWebRequest = request;
            crawledPage.RequestCompleted = DateTime.Now;

            if (response != null)
            {
                crawledPage.HttpWebResponse = new HttpWebResponseWrapper(response);
                CrawlDecision shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
                if (shouldDownloadContentDecision.Allow)
                {
                    crawledPage.DownloadContentStarted = DateTime.Now;
                    // Collect useful info from the page
                    crawledPage.Content = Extractor.GetContent(response);
                    crawledPage.DownloadContentCompleted = DateTime.Now;
                }
                else
                {
                    Logger.DebugFormat("Links on page [{0}] not crawled, [{1}]",
                        crawledPage.Uri.AbsoluteUri, shouldDownloadContentDecision.Reason);
                }

                // Should already be closed by Extractor, but just being safe
                response.Close();
            }
        }
        catch (Exception e)
        {
            Logger.DebugFormat("Error occurred finalizing request for url [{0}]", uri.AbsoluteUri);
            Logger.Debug(e);
        }
    }

    return crawledPage;
}
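// Hedged usage sketch, not part of the class above: awaiting the async variant and
// reading the timing fields it populates. "PageRequester" and the method name below
// are assumptions for illustration.
public static async Task<TimeSpan> MeasureRequestAsync(PageRequester requester, Uri uri)
{
    CrawledPage page = await requester.MakeRequestAsync(
        uri,
        crawled => new CrawlDecision { Allow = true });

    // RequestCompleted is assigned in the finally block above, so the elapsed
    // time is available even when the request faulted.
    return page.RequestCompleted - page.RequestStarted;
}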
/// <summary>
/// Makes an HTTP web request to the url and downloads its content based on the decision
/// returned by the shouldDownloadContent delegate.
/// </summary>
public virtual async Task<CrawledPage> MakeRequestAsync(Uri uri, Func<CrawledPage, CrawlDecision> shouldDownloadContent)
{
    if (uri == null)
        throw new ArgumentNullException(nameof(uri));

    var crawledPage = new CrawledPage(uri);

    HttpResponseMessage httpResponseMessage = null;
    try
    {
        crawledPage.RequestStarted = DateTime.Now;
        httpResponseMessage = await _httpClient.GetAsync(uri);
    }
    // Note: HttpClient surfaces failures as HttpRequestException (or IOException)
    // rather than WebException, so the WebException catch below is a fallback; see
    // http://stackoverflow.com/questions/22382373/httprequestexception-vs-webexception
    catch (WebException e)
    {
        crawledPage.WebException = e;
        _logger.LogDebug(e, $"Error occurred requesting url [{uri.AbsoluteUri}]");
    }
    catch (Exception e)
    {
        _logger.LogDebug(e, $"Error occurred requesting url [{uri.AbsoluteUri}]");
    }
    finally
    {
        try
        {
            crawledPage.HttpResponseMessage = httpResponseMessage;
            crawledPage.RequestCompleted = DateTime.Now;

            if (httpResponseMessage != null)
            {
                crawledPage.HttpWebResponse = new HttpWebResponseWrapper(httpResponseMessage);
                var shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
                if (shouldDownloadContentDecision.Allow)
                {
                    crawledPage.DownloadContentStarted = DateTime.Now;
                    crawledPage.Content = await _extractor.GetContent(httpResponseMessage);
                    crawledPage.DownloadContentCompleted = DateTime.Now;
                }
                else
                {
                    _logger.LogDebug($"Links on page [{crawledPage.Uri.AbsoluteUri}] not crawled, [{shouldDownloadContentDecision.Reason}]");
                }

                // No Close() here: the body should already be consumed by _extractor,
                // and crawledPage keeps a reference to httpResponseMessage.
            }
        }
        catch (Exception e)
        {
            _logger.LogDebug(e, $"Error occurred finalizing request for url [{uri.AbsoluteUri}]");
        }
    }

    return crawledPage;
}
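// Hedged sketch, not part of the class above: a reusable shouldDownloadContent
// factory that caps downloads by the declared Content-Length on the
// HttpResponseMessage this variant exposes. The name and threshold are
// assumptions for illustration.
public static Func<CrawledPage, CrawlDecision> MaxSizeGate(long maxBytes)
{
    return crawledPage =>
    {
        // Content-Length may be absent (e.g. chunked responses); allow those through
        long? declaredLength = crawledPage.HttpResponseMessage?.Content?.Headers?.ContentLength;
        bool allow = declaredLength == null || declaredLength <= maxBytes;

        return new CrawlDecision
        {
            Allow = allow,
            Reason = allow ? "" : $"Content-Length {declaredLength} exceeds {maxBytes} bytes"
        };
    };
}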
/// <summary>
/// Makes an HTTP web request to the url and downloads its content based on the decision
/// returned by the shouldDownloadContent delegate.
/// </summary>
public virtual CrawledPage MakeRequest(Uri uri, Func<CrawledPage, CrawlDecision> shouldDownloadContent)
{
    if (uri == null)
        throw new ArgumentNullException(nameof(uri));

    CrawledPage crawledPage = new CrawledPage(uri);

    HttpWebRequest request = null;
    HttpWebResponse response = null;
    try
    {
        request = BuildRequestObject(uri);
        crawledPage.RequestStarted = DateTime.Now;
        response = (HttpWebResponse)request.GetResponse();
        ProcessResponseObject(response);
    }
    catch (WebException e)
    {
        // Try to patch the redirect error by retrying with a repaired uri
        bool fixedRedirectError = false;
        if (e.Response != null)
        {
            string candidateUri = null;
            Uri fixedUri = null;

            int statusCode = e.Response is HttpWebResponse
                ? (int)((HttpWebResponse)e.Response).StatusCode
                : 0;
            if (statusCode >= 300 && statusCode <= 308)
            {
                string location = e.Response.Headers != null ? e.Response.Headers.Get("Location") : null;
                if (location != null)
                    candidateUri = location;
            }
            if (candidateUri == null && e.Response.ResponseUri != null)
                candidateUri = e.Response.ResponseUri.AbsoluteUri;

            if (candidateUri != null)
            {
                if (candidateUri.Contains("#"))
                {
                    // Drop the fragment: it is meaningless to the server
                    candidateUri = candidateUri.Substring(0, candidateUri.IndexOf('#'));
                }
                else if (candidateUri.EndsWith("%20"))
                {
                    // ToString() decodes %20 to a space, which Trim() then removes
                    candidateUri = e.Response.ResponseUri.ToString().Trim();
                }

                if (Uri.IsWellFormedUriString(candidateUri, UriKind.Absolute))
                    fixedUri = new Uri(candidateUri);
                else if (e.Response.ResponseUri != null && Uri.IsWellFormedUriString(candidateUri, UriKind.Relative))
                    fixedUri = new Uri(e.Response.ResponseUri, candidateUri);
            }

            if (fixedUri != null)
            {
                try
                {
                    request = BuildRequestObject(fixedUri);
                    crawledPage.RequestStarted = DateTime.Now;
                    response = (HttpWebResponse)request.GetResponse();
                    ProcessResponseObject(response);
                    fixedRedirectError = true;
                }
                catch { /* Failed again => stay on the first error */ }
            }
        }

        if (!fixedRedirectError)
        {
            crawledPage.WebException = e;

            if (e.Response != null)
            {
                response = (HttpWebResponse)e.Response;

                // Check for protections against bot scraping
                if (response.Headers.Get("X-DataDome") != null)
                {
                    Console.WriteLine("This website is protected against bot crawling by https://datadome.co/fr/ : crawl aborted to comply with this policy.");
                }
            }

            _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
            _logger.Debug(e);
        }
    }
    catch (Exception e)
    {
        _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
        _logger.Debug(e);
    }
    finally
    {
        try
        {
            crawledPage.HttpWebRequest = request;
            crawledPage.RequestCompleted = DateTime.Now;

            if (response != null)
            {
                crawledPage.HttpWebResponse = new HttpWebResponseWrapper(response);
                CrawlDecision shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
                if (shouldDownloadContentDecision.Allow)
                {
                    crawledPage.DownloadContentStarted = DateTime.Now;
                    crawledPage.Content = _extractor.GetContent(response);
                    crawledPage.DownloadContentCompleted = DateTime.Now;
                }
                else
                {
                    _logger.DebugFormat("Links on page [{0}] not crawled, [{1}]",
                        crawledPage.Uri.AbsoluteUri, shouldDownloadContentDecision.Reason);
                }

                // Should already be closed by _extractor, but just being safe
                response.Close();
            }
        }
        catch (Exception e)
        {
            _logger.DebugFormat("Error occurred finalizing request for url [{0}]", uri.AbsoluteUri);
            _logger.Debug(e);
        }
    }

    return crawledPage;
}
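// Hedged sketch, not part of the class above: the Location-header repair from the
// catch block, isolated as a helper so the resolution rules are easier to test in
// isolation. This mirrors the logic above; the method name is hypothetical.
public static Uri TryFixRedirectUri(WebResponse errorResponse)
{
    if (errorResponse == null)
        return null;

    // Prefer the Location header on 3xx responses, then fall back to the
    // uri the server actually answered from.
    string candidateUri = null;
    int statusCode = errorResponse is HttpWebResponse httpResponse
        ? (int)httpResponse.StatusCode
        : 0;
    if (statusCode >= 300 && statusCode <= 308)
        candidateUri = errorResponse.Headers != null ? errorResponse.Headers.Get("Location") : null;
    if (candidateUri == null && errorResponse.ResponseUri != null)
        candidateUri = errorResponse.ResponseUri.AbsoluteUri;
    if (candidateUri == null)
        return null;

    // Strip fragments; absolute uris win, relative ones resolve against the response uri
    int hashIndex = candidateUri.IndexOf('#');
    if (hashIndex >= 0)
        candidateUri = candidateUri.Substring(0, hashIndex);

    if (Uri.IsWellFormedUriString(candidateUri, UriKind.Absolute))
        return new Uri(candidateUri);
    if (errorResponse.ResponseUri != null && Uri.IsWellFormedUriString(candidateUri, UriKind.Relative))
        return new Uri(errorResponse.ResponseUri, candidateUri);

    return null;
}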