/// <summary>
/// Fetches the content of a page and parses it as an XML document.
/// </summary>
/// <param name="url">The URL of the page to load.</param>
/// <param name="shortDescrip">A short description of the page, forwarded to the content loader for status updates.</param>
/// <param name="caching">The caching mode, forwarded to the content loader.</param>
/// <param name="shouldCache">Whether the loaded result should be cached, forwarded to the content loader.</param>
/// <param name="suppressNotifications">Whether notifications should be suppressed, forwarded to the content loader.</param>
/// <param name="token">Cancellation token.</param>
/// <returns>The parsed XML document, or null when no content was obtained.</returns>
public async Task<XDocument> GetXmlPage(string url, string shortDescrip,
    CachingMode caching, ShouldCache shouldCache,
    SuppressNotifications suppressNotifications, CancellationToken token)
{
    string pageText = await GetPageContent(url, shortDescrip, caching, shouldCache, suppressNotifications, token)
        .ConfigureAwait(false);

    // An empty or missing payload is reported as "no document" rather than parsed.
    if (string.IsNullOrEmpty(pageText))
    {
        return null;
    }

    return XDocument.Parse(pageText);
}
/// <summary>
/// Fetches the content of a web page and parses it into an HTML document.
/// </summary>
/// <param name="url">The URL of the page to load.</param>
/// <param name="shortDescrip">A short description of the page, forwarded to the content loader for status updates.</param>
/// <param name="caching">The caching mode, forwarded to the content loader.</param>
/// <param name="shouldCache">Whether the loaded result should be cached, forwarded to the content loader.</param>
/// <param name="suppressNotifications">Whether notifications should be suppressed, forwarded to the content loader.</param>
/// <param name="token">Cancellation token; also passed to the background parse task.</param>
/// <returns>The parsed HTML document, or null when no content was obtained.</returns>
public async Task<HtmlDocument> GetPage(string url, string shortDescrip,
    CachingMode caching, ShouldCache shouldCache,
    SuppressNotifications suppressNotifications, CancellationToken token)
{
    string pageText = await GetPageContent(url, shortDescrip, caching, shouldCache, suppressNotifications, token)
        .ConfigureAwait(false);

    // Nothing to parse: signal that with a null document.
    if (string.IsNullOrEmpty(pageText))
    {
        return null;
    }

    // Parsing is pushed onto a worker task so the caller's thread is not tied up.
    HtmlDocument parsed = new HtmlDocument();
    await Task.Run(() => parsed.LoadHtml(pageText), token).ConfigureAwait(false);

    return parsed;
}
/// <summary>
/// Fetches the content of a page and parses it as an XML document, logging each stage.
/// </summary>
/// <param name="url">The URL of the page to load.</param>
/// <param name="shortDescrip">A short description of the page, used in log messages and forwarded for status updates.</param>
/// <param name="caching">The caching mode, forwarded to the content loader.</param>
/// <param name="shouldCache">Whether the loaded result should be cached, forwarded to the content loader.</param>
/// <param name="suppressNotifications">Whether notifications should be suppressed, forwarded to the content loader.</param>
/// <param name="token">Cancellation token.</param>
/// <returns>The parsed XML document, or null when no content was obtained.</returns>
public async Task<XDocument?> GetXmlDocumentAsync(string url, string shortDescrip,
    CachingMode caching, ShouldCache shouldCache,
    SuppressNotifications suppressNotifications, CancellationToken token)
{
    logger.LogInformation($"Requested XML document \"{shortDescrip}\"");

    string pageText = await GetPageContent(url, shortDescrip, caching, shouldCache, suppressNotifications, token)
        .ConfigureAwait(false);

    // No content means no document; nothing further is logged in that case.
    if (string.IsNullOrEmpty(pageText))
    {
        return null;
    }

    logger.LogInformation($"\"{shortDescrip}\" successfully loaded.");

    XDocument parsed = XDocument.Parse(pageText);

    logger.LogDebug($"\"{shortDescrip}\" successfully parsed into XDocument.");

    return parsed;
}
/// <summary>
/// Returns the text of the requested page, taking it from the cache when a cached
/// copy is found and otherwise requesting it from the URL.
/// </summary>
/// <param name="url">The URL to load; it is passed through GetVerifiedUrl before use.</param>
/// <param name="shortDescrip">The short description of the page (for notifications).</param>
/// <param name="caching">The caching mode, forwarded to the cache lookup.</param>
/// <param name="shouldCache">Whether a freshly loaded page should be cached.</param>
/// <param name="suppressNotifications">Whether to suppress notifications.</param>
/// <param name="token">The cancellation token.</param>
/// <returns>The cached or loaded content. A failed load yields an empty string rather than null.</returns>
private async Task<string> GetPageContent(string url, string shortDescrip,
    CachingMode caching, ShouldCache shouldCache,
    SuppressNotifications suppressNotifications, CancellationToken token)
{
    var (verifiedUri, verifiedUrl) = GetVerifiedUrl(url);
    var (isCached, cachedText) = GetCachedContent(verifiedUrl, caching);

    // Cache hit: announce it and hand the cached text straight back.
    if (isCached)
    {
        NotifyStatusChange(PageRequestStatusType.LoadedFromCache, verifiedUrl, shortDescrip, null, suppressNotifications);
        return cachedText;
    }

    var fetchedText = await GetUrlContent(verifiedUri, verifiedUrl, shortDescrip, shouldCache, suppressNotifications, token)
        .ConfigureAwait(false);

    // Callers test with IsNullOrEmpty, so a failed load is normalised to an empty string.
    return fetchedText ?? string.Empty;
}
/// <summary>
/// Fetches the content of a web page and parses it into an HTML document, logging each stage.
/// </summary>
/// <param name="url">The URL of the page to load.</param>
/// <param name="shortDescrip">A short description of the page, used in log messages and forwarded for status updates.</param>
/// <param name="caching">The caching mode, forwarded to the content loader.</param>
/// <param name="shouldCache">Whether the loaded result should be cached, forwarded to the content loader.</param>
/// <param name="suppressNotifications">Whether notifications should be suppressed, forwarded to the content loader.</param>
/// <param name="token">Cancellation token; also passed to the background parse task.</param>
/// <returns>The parsed HTML document, or null when no content was obtained.</returns>
public async Task<HtmlDocument?> GetHtmlDocumentAsync(string url, string shortDescrip,
    CachingMode caching, ShouldCache shouldCache,
    SuppressNotifications suppressNotifications, CancellationToken token)
{
    logger.LogInformation($"Requested HTML document \"{shortDescrip}\"");

    string content = await GetPageContent(url, shortDescrip, caching, shouldCache, suppressNotifications, token)
        .ConfigureAwait(false);

    if (string.IsNullOrEmpty(content))
    {
        return null;
    }

    // The content may have been served from the cache by GetPageContent, so the
    // message must not claim that it came from the web.
    logger.LogInformation($"\"{shortDescrip}\" successfully loaded.");

    // Parsing is pushed onto a worker task so the caller's thread is not tied up.
    HtmlDocument htmldoc = new HtmlDocument();
    await Task.Run(() => htmldoc.LoadHtml(content), token).ConfigureAwait(false);

    logger.LogDebug($"\"{shortDescrip}\" successfully parsed into HtmlDocument.");

    return htmldoc;
}
/// <summary>
/// Dispatches a page request status change to the matching notification method,
/// unless notifications are suppressed.
/// </summary>
/// <param name="status">The status being reported.</param>
/// <param name="url">The URL of the request; reported for the Requested status.</param>
/// <param name="shortDescrip">The short description of the page; reported for errors and results.</param>
/// <param name="e">The exception to report for the Error status.</param>
/// <param name="suppressNotifications">When set to Yes, nothing is reported.</param>
protected void NotifyStatusChange(PageRequestStatusType status, string url, string shortDescrip,
    Exception e, SuppressNotifications suppressNotifications)
{
    if (suppressNotifications == SuppressNotifications.Yes)
    {
        return;
    }

    switch (status)
    {
        case PageRequestStatusType.Requested:
            NotifyRequest(url);
            break;

        case PageRequestStatusType.Cancelled:
            NotifyCancel();
            break;

        case PageRequestStatusType.Error:
            NotifyError(shortDescrip, e);
            break;

        default:
            // Every other status is reported as a plain result.
            NotifyResult(status, shortDescrip);
            break;
    }
}
/// <summary>
/// Sends a HEAD request for the given URL and returns the request URI recorded on the response,
/// i.e. the address the request ended up at.
/// </summary>
/// <param name="url">The URL to load; it is passed through GetVerifiedUrl before use.</param>
/// <param name="shortDescrip">Short description of the page being loaded.</param>
/// <param name="suppressNotifications">Whether to suppress notifications.</param>
/// <param name="token">Cancellation token.</param>
/// <returns>The URI taken from the response, or null if every attempt timed out.</returns>
/// <exception cref="OperationCanceledException">If the cancellation token is triggered.</exception>
/// <exception cref="HttpRequestException">Rethrown, after an error notification, if the request fails.</exception>
private async Task<Uri?> GetRedirectedHeaderRequestUri(string url, string? shortDescrip,
    SuppressNotifications suppressNotifications, CancellationToken token)
{
    var (uri, _) = GetVerifiedUrl(url);

    NotifyStatusChange(PageRequestStatusType.Requested, url, shortDescrip, null, suppressNotifications);

    // Limit to no more than N parallel requests
    await ss.WaitAsync(token).ConfigureAwait(false);

    try
    {
        Cookie? cookie = ForumCookies.GetCookie(uri);
        if (cookie != null)
        {
            ClientHandler.CookieContainer.Add(uri, cookie);
        }

        string? authorization = ForumAuthentications.GetAuthorization(uri);

        int tries = 0;

        do
        {
            token.ThrowIfCancellationRequested();

            if (tries > 0)
            {
                // Delay any additional attempts after the first.
                await Task.Delay(retryDelay, token).ConfigureAwait(false);

                // Notify the user if we're re-trying to load the page.
                NotifyStatusChange(PageRequestStatusType.Retry, url, shortDescrip, null, suppressNotifications);
            }

            tries++;

            try
            {
                using HttpRequestMessage request = new HttpRequestMessage(HttpMethod.Head, uri);

                // The credentials go on this request only. The client's DefaultRequestHeaders are
                // shared by every request running in parallel, so adding and removing the header
                // there can fail on a duplicate add or strip it from another in-flight request.
                if (authorization != null)
                {
                    request.Headers.Add("Authorization", authorization);
                }

                // As long as we got a response (whether 200 or 404), we can extract what
                // the server thinks the URL should be.
                using (HttpResponseMessage response = await httpClient.SendAsync(request, token).ConfigureAwait(false))
                {
                    return response.RequestMessage?.RequestUri;
                }
            }
            catch (HttpRequestException e)
            {
                NotifyStatusChange(PageRequestStatusType.Error, url, shortDescrip, e, suppressNotifications);
                throw;
            }
            catch (OperationCanceledException)
            {
                if (token.IsCancellationRequested)
                {
                    // user request
                    throw;
                }
                else
                {
                    // timeout via cancellation
                    logger.LogDebug($"Attempt to load {shortDescrip} timed out/self-cancelled (TA). Tries={tries}");
                }
            }
            catch (TimeoutException)
            {
                logger.LogDebug($"Attempt to load {shortDescrip} timed out. Tries={tries}");
            }
        } while (tries < retryLimit);
    }
    finally
    {
        ss.Release();
    }

    return null;
}
/// <summary>
/// Requests the content of a URL as a string, retrying timed-out attempts up to the retry limit.
/// </summary>
/// <param name="uri">The URI to request.</param>
/// <param name="url">The string form of the URL, used in notifications and as the cache key.</param>
/// <param name="shortDescrip">A short description of the page, used in notifications and log messages.</param>
/// <param name="shouldCache">Indicates whether the result of this page load should be cached.</param>
/// <param name="suppressNotifications">Whether to suppress notifications.</param>
/// <param name="token">Cancellation token.</param>
/// <returns>The response body if the page was loaded; otherwise null.</returns>
/// <exception cref="OperationCanceledException">If the cancellation token is triggered.</exception>
/// <exception cref="HttpRequestException">Rethrown, after an error notification, if the request fails.</exception>
private async Task<string?> GetUrlContent(Uri uri, string url, string shortDescrip,
    ShouldCache shouldCache, SuppressNotifications suppressNotifications, CancellationToken token)
{
    string? result = null;
    int tries = 0;
    DateTime expires = CacheInfo.DefaultExpiration;

    NotifyStatusChange(PageRequestStatusType.Requested, url, shortDescrip, null, suppressNotifications);

    // Limit to no more than N parallel requests
    await ss.WaitAsync(token).ConfigureAwait(false);

    try
    {
        Cookie? cookie = ForumCookies.GetCookie(uri);
        if (cookie != null)
        {
            ClientHandler.CookieContainer.Add(uri, cookie);
        }

        string? authorization = ForumAuthentications.GetAuthorization(uri);
        if (authorization != null && !httpClient.DefaultRequestHeaders.Contains("Authorization"))
        {
            httpClient.DefaultRequestHeaders.Add("Authorization", authorization);
        }

        Task<HttpResponseMessage>? getResponseTask = null;

        do
        {
            token.ThrowIfCancellationRequested();

            if (tries > 0)
            {
                // Delay any additional attempts after the first.
                await Task.Delay(retryDelay, token).ConfigureAwait(false);

                // Notify the user if we're making another attempt to load the page.
                NotifyStatusChange(PageRequestStatusType.Retry, url, shortDescrip, null, suppressNotifications);
            }

            tries++;

            try
            {
                getResponseTask = httpClient.GetAsync(uri, token).TimeoutAfter(timeout, token);
                logger.LogDebug($"Get URI {uri} task ID: {getResponseTask.Id}");

                using (var response = await getResponseTask.ConfigureAwait(false))
                {
                    if (response.IsSuccessStatusCode)
                    {
                        result = await response.Content.ReadAsStringAsync().ConfigureAwait(false);

                        // Get expires value
                        // Cannot get Expires value until we move to .NET Standard 2.0.

                        // If we get a successful result, we're done.
                        break;
                    }
                    else if (PageLoadFailed(response))
                    {
                        NotifyStatusChange(PageRequestStatusType.Failed, url,
                            GetFailureMessage(response, shortDescrip, url), null, suppressNotifications);
                        return null;
                    }
                    else if (PageWasMoved(response))
                    {
                        // Content-Location may be absent (redirects normally carry Location) and
                        // either header may be relative, so fall back and resolve against the
                        // current address instead of dereferencing blindly.
                        Uri? movedTo = response.Content.Headers.ContentLocation ?? response.Headers.Location;

                        if (movedTo != null)
                        {
                            uri = movedTo.IsAbsoluteUri ? movedTo : new Uri(uri, movedTo);
                            url = uri.AbsoluteUri;
                        }

                        // With no target supplied, the next attempt simply retries the same address.
                    }
                }
            }
            catch (OperationCanceledException)
            {
                if (token.IsCancellationRequested)
                {
                    // user request
                    throw;
                }
                else
                {
                    // timeout via cancellation
                    logger.LogDebug($"Attempt to load {shortDescrip} timed out/self-cancelled (TA). Tries={tries}");
                }
            }
            catch (TimeoutException)
            {
                logger.LogDebug($"Attempt to load {shortDescrip} timed out. Tries={tries}");
            }
            catch (HttpRequestException e)
            {
                NotifyStatusChange(PageRequestStatusType.Error, url, shortDescrip, e, suppressNotifications);
                throw;
            }
        } while (tries < retryLimit);

        logger.LogDebug($"Finished getting URI {uri} task ID: {getResponseTask?.Id ?? 0}");

        if (result == null && tries >= retryLimit)
        {
            // NOTE(review): this cancels every pending request on the shared client, not just
            // this one - confirm that is intended while other requests run in parallel.
            httpClient.CancelPendingRequests();
        }
    }
    catch (OperationCanceledException)
    {
        // If it's not a user-requested cancellation, generate a failure message.
        if (!token.IsCancellationRequested)
        {
            NotifyStatusChange(PageRequestStatusType.Failed, url, shortDescrip, null, suppressNotifications);
        }

        throw;
    }
    finally
    {
        ss.Release();
    }

    token.ThrowIfCancellationRequested();

    if (result == null)
    {
        NotifyStatusChange(PageRequestStatusType.Failed, url, shortDescrip, null, suppressNotifications);
        return null;
    }

    if (shouldCache == ShouldCache.Yes)
    {
        Cache.Add(url, result, expires);
    }

    NotifyStatusChange(PageRequestStatusType.Loaded, url, shortDescrip, null, suppressNotifications);

    return result;
}
/// <summary>
/// Loads the HEAD of the requested URL, and returns the response URL value.
/// For a site that redirects some queries, this allows you to get the 'real' URL for a given short URL.
/// </summary>
/// <param name="url">The URL of the page to load.</param>
/// <param name="shortDescrip">A short description of the page, used in log messages and status updates.</param>
/// <param name="caching">Not used by this method.</param>
/// <param name="shouldCache">Not used by this method.</param>
/// <param name="suppressNotifications">Indicates whether notification messages should be suppressed.</param>
/// <param name="token">Cancellation token.</param>
/// <returns>
/// The absolute URL reported by the response, or an empty string if no response URI was obtained.
/// </returns>
public async Task<string> GetRedirectUrlAsync(string url, string? shortDescrip,
    CachingMode caching, ShouldCache shouldCache,
    SuppressNotifications suppressNotifications, CancellationToken token)
{
    logger.LogInformation($"Requested URL redirect for \"{shortDescrip}\"");

    // ConfigureAwait(false) matches every other await in this class: nothing after this
    // point needs the caller's synchronization context.
    Uri? responseUri = await GetRedirectedHeaderRequestUri(url, shortDescrip, suppressNotifications, token)
        .ConfigureAwait(false);

    string result = responseUri?.AbsoluteUri ?? string.Empty;

    if (string.IsNullOrEmpty(result))
    {
        // Interpolated like the other log messages; without the '$' the placeholder
        // text itself was written to the log instead of the description.
        logger.LogDebug($"Redirect request failed for \"{shortDescrip}\".");
    }
    else
    {
        logger.LogDebug($"Redirect request succeeded. Using {result}");
    }

    return result;
}
/// <summary>
/// Loads the HEAD of the requested URL, and returns the response URL value.
/// For a site that redirects some queries, this allows you to get the 'real' URL for a given short URL.
/// </summary>
/// <param name="url">The URL of the page to load.</param>
/// <param name="shortDescrip">A short description of the page, used in status updates.</param>
/// <param name="caching">Not used by this method.</param>
/// <param name="shouldCache">Not used by this method.</param>
/// <param name="suppressNotifications">Indicates whether notification messages should be suppressed.</param>
/// <param name="token">Cancellation token.</param>
/// <returns>
/// The absolute URL reported by the response, or null if no response URI was obtained.
/// </returns>
public async Task<string> GetRedirectUrl(string url, string shortDescrip,
    CachingMode caching, ShouldCache shouldCache,
    SuppressNotifications suppressNotifications, CancellationToken token)
{
    // ConfigureAwait(false) matches the other awaits in this class: the continuation
    // only reads a property and does not need the caller's synchronization context.
    Uri responseUri = await GetRedirectedHeaderRequestUri(url, shortDescrip, suppressNotifications, token)
        .ConfigureAwait(false);

    return responseUri?.AbsoluteUri;
}
/// <summary>
/// Looks up the requested page in the cache when the caching mode allows it.
/// </summary>
/// <param name="url">The URL used as the cache key.</param>
/// <param name="shortDescrip">The short description of the page (for notifications).</param>
/// <param name="caching">The caching mode; the cache is only consulted for UseCache.</param>
/// <param name="suppressNotifications">Whether to suppress the loaded-from-cache notification.</param>
/// <returns>
/// A tuple of whether a cached document was found, and that document (null when not found).
/// </returns>
private async Task<Tuple<bool, HtmlDocument>> TryGetCachedPageAsync(string url, string shortDescrip,
    CachingMode caching, SuppressNotifications suppressNotifications)
{
    // Cache lookups are disabled for this request: report a miss without touching the cache.
    if (caching != CachingMode.UseCache)
    {
        return new Tuple<bool, HtmlDocument>(false, null);
    }

    HtmlDocument cachedDoc = await Cache.GetAsync(url).ConfigureAwait(false);
    bool wasFound = cachedDoc != null;

    if (wasFound)
    {
        NotifyStatusChange(PageRequestStatusType.LoadedFromCache, url, shortDescrip, null, suppressNotifications);
    }

    return Tuple.Create(wasFound, cachedDoc);
}
/// <summary>
/// Loads the HEAD of the requested URL, and returns the response URL value.
/// For a site that redirects some queries, this allows you to get the 'real' URL for a given short URL.
/// </summary>
/// <param name="url">The URL of the page to load. Must be a well-formed absolute URL.</param>
/// <param name="shortDescrip">A short description of the page, used in status updates.</param>
/// <param name="caching">Not used by this method.</param>
/// <param name="shouldCache">Not used by this method.</param>
/// <param name="suppressNotifications">Indicates whether notification messages should be suppressed.</param>
/// <param name="token">Cancellation token.</param>
/// <returns>
/// The absolute URL that the response says was requested, or null if no response was
/// obtained within the retry limit or cancellation was already requested.
/// </returns>
/// <exception cref="System.ArgumentNullException">If url is null or empty.</exception>
/// <exception cref="System.ArgumentException">If url is not a well-formed absolute URL.</exception>
/// <exception cref="System.OperationCanceledException">If the token is cancelled while waiting or sending.</exception>
/// <exception cref="System.Net.Http.HttpRequestException">Rethrown, after an error notification, if the request fails.</exception>
public async Task<string> GetHeaderUrl(string url, string shortDescrip,
    CachingMode caching, ShouldCache shouldCache,
    SuppressNotifications suppressNotifications, CancellationToken token)
{
    if (string.IsNullOrEmpty(url))
    {
        throw new ArgumentNullException(nameof(url));
    }

    if (!Uri.IsWellFormedUriString(url, UriKind.Absolute))
    {
        throw new ArgumentException($"Url is not valid: {url}", nameof(url));
    }

    Uri uri = new Uri(url);

    NotifyStatusChange(PageRequestStatusType.Requested, url, shortDescrip, null, suppressNotifications);

    // Limit to no more than N parallel requests
    await ss.WaitAsync(token).ConfigureAwait(false);

    try
    {
        Cookie cookie = ForumCookies.GetCookie(uri);
        if (cookie != null)
        {
            ClientHandler.CookieContainer.Add(uri, cookie);
        }

        int tries = 0;

        while (tries < retryLimit && !token.IsCancellationRequested)
        {
            if (tries > 0)
            {
                // If we have to retry loading the page, give it a short delay.
                // The token is passed so a cancellation does not have to sit out the wait.
                await Task.Delay(TimeSpan.FromSeconds(4), token).ConfigureAwait(false);

                NotifyStatusChange(PageRequestStatusType.Retry, url, shortDescrip, null, suppressNotifications);
            }

            tries++;

            try
            {
                // A request message cannot be sent more than once, so each attempt builds
                // (and disposes) its own.
                using (HttpRequestMessage request = new HttpRequestMessage(HttpMethod.Head, uri))
                using (HttpResponseMessage response = await client.SendAsync(request, token).ConfigureAwait(false))
                {
                    // As long as we got a response (whether 200 or 404), we can extract what
                    // the server thinks the URL should be.
                    return response.RequestMessage?.RequestUri?.AbsoluteUri;
                }
            }
            catch (HttpRequestException e)
            {
                NotifyStatusChange(PageRequestStatusType.Error, url, shortDescrip, e, suppressNotifications);
                throw;
            }
            catch (OperationCanceledException) when (!token.IsCancellationRequested)
            {
                // The request timed out rather than being cancelled by the caller:
                // fall through so the loop can make another attempt.
            }
        }
    }
    finally
    {
        ss.Release();
    }

    // Reaching this point means no response was obtained, so it must not be reported as loaded.
    PageRequestStatusType outcome = token.IsCancellationRequested
        ? PageRequestStatusType.Cancelled
        : PageRequestStatusType.Failed;

    NotifyStatusChange(outcome, url, shortDescrip, null, suppressNotifications);

    return null;
}
/// <summary>
/// Asynchronously load a specific page, from the cache if allowed and available,
/// otherwise from the web with retries on timeout.
/// </summary>
/// <param name="url">The URL of the page to load. Must be a well-formed absolute URL.</param>
/// <param name="shortDescrip">A short description of the page, used in status updates.</param>
/// <param name="caching">Indicator of whether to query the cache for the requested page.</param>
/// <param name="shouldCache">Indicates whether the result of this page load should be cached.</param>
/// <param name="suppressNotifications">Indicates whether notification messages should be suppressed.</param>
/// <param name="token">Cancellation token.</param>
/// <returns>Returns an HTML document, if it can be loaded; otherwise null.</returns>
/// <exception cref="ArgumentNullException">If url is null or empty.</exception>
/// <exception cref="ArgumentException">If url is not a valid absolute url.</exception>
/// <exception cref="OperationCanceledException">If the cancellation token is triggered.</exception>
/// <exception cref="HttpRequestException">Rethrown, after an error notification, if the request fails.</exception>
public async Task<HtmlDocument> GetPage(string url, string shortDescrip,
    CachingMode caching, ShouldCache shouldCache,
    SuppressNotifications suppressNotifications, CancellationToken token)
{
    if (string.IsNullOrEmpty(url))
    {
        throw new ArgumentNullException(nameof(url));
    }

    if (!Uri.IsWellFormedUriString(url, UriKind.Absolute))
    {
        throw new ArgumentException($"Url is not valid: {url}", nameof(url));
    }

    Uri uri = new Uri(url);

    // The unescaped form is what gets used for cache keys and notifications.
    url = Uri.UnescapeDataString(url);

    HtmlDocument htmldoc = null;
    string result = null;
    int tries = 0;

    // Try to load from cache first, if allowed.
    if (caching == CachingMode.UseCache)
    {
        htmldoc = await Cache.GetAsync(url).ConfigureAwait(false);

        if (htmldoc != null)
        {
            NotifyStatusChange(PageRequestStatusType.LoadedFromCache, url, shortDescrip, null, suppressNotifications);
            return htmldoc;
        }
    }

    NotifyStatusChange(PageRequestStatusType.Requested, url, shortDescrip, null, suppressNotifications);

    // Limit to no more than N parallel requests
    await ss.WaitAsync(token).ConfigureAwait(false);

    try
    {
        Cookie cookie = ForumCookies.GetCookie(uri);
        if (cookie != null)
        {
            ClientHandler.CookieContainer.Add(uri, cookie);
        }

        Task<HttpResponseMessage> getResponseTask = null;

        do
        {
            token.ThrowIfCancellationRequested();

            if (tries > 0)
            {
                // Delay any additional attempts after the first.
                await Task.Delay(retryDelay, token).ConfigureAwait(false);

                // Notify the user if we're re-trying to load the page.
                NotifyStatusChange(PageRequestStatusType.Retry, url, shortDescrip, null, suppressNotifications);
            }

            tries++;

            try
            {
                getResponseTask = client.GetAsync(uri, token).TimeoutAfter(timeout, token);
                Debug.WriteLine($"Get URI {uri} task ID: {getResponseTask.Id}");

                using (HttpResponseMessage response = await getResponseTask.ConfigureAwait(false))
                {
                    if (response.IsSuccessStatusCode)
                    {
                        result = await response.Content.ReadAsStringAsync().ConfigureAwait(false);

                        // If we get a successful result, we're done.
                        break;
                    }
                    else if (PageLoadFailed(response))
                    {
                        NotifyStatusChange(PageRequestStatusType.Failed, url,
                            GetFailureMessage(response, shortDescrip, url), null, suppressNotifications);
                        return null;
                    }
                    else if (PageWasMoved(response))
                    {
                        // Content-Location may be absent (redirects normally carry Location) and
                        // either header may be relative, so fall back and resolve against the
                        // current address instead of dereferencing blindly.
                        Uri movedTo = response.Content.Headers.ContentLocation ?? response.Headers.Location;

                        if (movedTo != null)
                        {
                            uri = movedTo.IsAbsoluteUri ? movedTo : new Uri(uri, movedTo);
                            url = uri.AbsoluteUri;
                        }

                        // With no target supplied, the next attempt simply retries the same address.
                    }
                }
            }
            catch (OperationCanceledException)
            {
                if (token.IsCancellationRequested)
                {
                    // user request
                    throw;
                }
                else
                {
                    // timeout via cancellation
                    Debug.WriteLine($"Attempt to load {shortDescrip} timed out/self-cancelled (TA). Tries={tries}");
                }
            }
            catch (TimeoutException)
            {
                Debug.WriteLine($"Attempt to load {shortDescrip} timed out. Tries={tries}");
            }
            catch (HttpRequestException e)
            {
                NotifyStatusChange(PageRequestStatusType.Error, url, shortDescrip, e, suppressNotifications);
                throw;
            }
        } while (tries < retryLimit);

        // The task reference stays null if creating the request threw a handled exception
        // on every attempt, so it is read null-safely.
        Debug.WriteLine($"Finished getting URI {uri} task ID: {getResponseTask?.Id ?? 0}");

        if (result == null && tries >= retryLimit)
        {
            // NOTE(review): this cancels every pending request on the shared client, not just
            // this one - confirm that is intended while other requests run in parallel.
            client.CancelPendingRequests();
        }
    }
    catch (OperationCanceledException)
    {
        // If it's not a user-requested cancellation, generate a failure message.
        if (!token.IsCancellationRequested)
        {
            NotifyStatusChange(PageRequestStatusType.Failed, url, shortDescrip, null, suppressNotifications);
        }

        throw;
    }
    finally
    {
        ss.Release();
    }

    token.ThrowIfCancellationRequested();

    if (result == null)
    {
        NotifyStatusChange(PageRequestStatusType.Failed, url, shortDescrip, null, suppressNotifications);
        return null;
    }

    if (shouldCache == ShouldCache.Yes)
    {
        Cache.Add(url, result);
    }

    // Parsing is pushed onto a worker task so the caller's thread is not tied up.
    htmldoc = new HtmlDocument();
    await Task.Run(() => htmldoc.LoadHtml(result)).ConfigureAwait(false);

    NotifyStatusChange(PageRequestStatusType.Loaded, url, shortDescrip, null, suppressNotifications);

    return htmldoc;
}