/// <summary>
/// Fallback fetch that issues a GET for the URL with the request version set to HTTP/2.
/// </summary>
/// <param name="url">The URL to fetch.</param>
/// <param name="acceptHeaders">Values handed to <c>AddAcceptHeaders</c> for the outgoing request.</param>
/// <returns>
/// A result whose <c>Result</c> holds the response body on success, or whose <c>Error</c> holds
/// the exception that was caught on failure.
/// </returns>
private async Task<HttpFetchResult> TryFetchWithHttp2Client(Uri url, params string[] acceptHeaders)
{
    try
    {
        using var request = new HttpRequestMessage(HttpMethod.Get, url);
        request.Version = new Version(2, 0);
        request.AddAcceptHeaders(acceptHeaders);

        using var response = await http.SendAsync(request);
        response.EnsureSuccessStatusCode();

        var body = await response.Content.ReadAsStringAsync();
        logger.LogInformation("Successfully fetched {url} via HTTP/2 fallback", url);

        var success = new HttpFetchResult { Result = body };
        return success;
    }
    catch (Exception error)
    {
        // Never rethrow: the caller receives the failure inside the result object.
        logger.LogWarning(error, "Unable to fetch {url} using HTTP/2 fallback.", url);

        var failure = new HttpFetchResult { Error = error };
        return failure;
    }
}
/// <summary>
/// Fetches the URL with " curl/7.64.1" appended to the user agent, which some sites
/// (e.g. Facebook) require for programmatic fetching to work.
/// The default user agent is restored afterwards, whether the fetch succeeds or fails.
/// </summary>
/// <param name="url">The URL to fetch.</param>
/// <param name="acceptHeaders">Values handed to <c>AddAcceptHeaders</c> for the outgoing request.</param>
/// <returns>
/// A result whose <c>Result</c> holds the response body on success, or whose <c>Error</c> holds
/// the exception that was caught on failure.
/// </returns>
private async Task<HttpFetchResult> TryFetchWithCurlUserAgent(Uri url, string[] acceptHeaders)
{
    try
    {
        // Append CURL to the user agent.
        // NOTE(review): this changes state on the shared `http` object; if fetches can run
        // concurrently, other in-flight requests may pick up the CURL user agent — confirm.
        http.SetUserAgent(userAgent + " curl/7.64.1");

        using var httpRequest = new HttpRequestMessage(HttpMethod.Get, url);
        httpRequest.AddAcceptHeaders(acceptHeaders);

        // Dispose the response once the body has been read so it is not left undisposed.
        using var httpResponse = await http.SendAsync(httpRequest);
        httpResponse.EnsureSuccessStatusCode();

        var content = await httpResponse.Content.ReadAsStringAsync();
        return new HttpFetchResult { Result = content };
    }
    catch (Exception fetchError)
    {
        logger.LogWarning(fetchError, "Unable to fetch {url} using CURL user agent fallback.", url);
        return new HttpFetchResult { Error = fetchError };
    }
    finally
    {
        // Reset the user agent back to the default user agent.
        http.SetUserAgent(userAgent);
    }
}
/// <summary>
/// Attempts to fetch a resource at the specified URL.
/// If the server answers 403 Forbidden, the fetch is retried with the CURL user agent fallback.
/// If the response's ContentType character set is invalid, the fetch is retried with forced UTF-8.
/// Any other failure falls back to an HTTP/2 fetch.
/// If the chosen fallback also fails, the returned result is whatever that fallback produced.
/// </summary>
/// <param name="url">The URL to fetch.</param>
/// <param name="acceptHeaders">Values handed to <c>AddAcceptHeaders</c> for the outgoing request.</param>
/// <returns>
/// A result whose <c>Result</c> holds the response body when the initial fetch succeeds;
/// otherwise the result returned by the fallback that was attempted.
/// </returns>
private async Task<HttpFetchResult> TryFetch(Uri url, params string[] acceptHeaders)
{
    try
    {
        using var httpRequest = new HttpRequestMessage(HttpMethod.Get, url);
        httpRequest.AddAcceptHeaders(acceptHeaders);

        // Dispose the response when leaving the try block, so it is released before any
        // fallback in the catch handlers issues a new request.
        using var httpResponse = await http.SendAsync(httpRequest);

        // If it's a 403, we have special handling for this.
        if (httpResponse.StatusCode == System.Net.HttpStatusCode.Forbidden)
        {
            var errorMessage = !string.IsNullOrWhiteSpace(httpResponse.ReasonPhrase)
                ? httpResponse.ReasonPhrase
                : "Web server's response was 403 Forbidden.";
            throw new HttpForbiddenException(errorMessage);
        }

        httpResponse.EnsureSuccessStatusCode();

        var content = await httpResponse.Content.ReadAsStringAsync();
        return new HttpFetchResult { Result = content };
    }
    catch (InvalidOperationException invalidOpError) when (invalidOpError.Message.Contains("The character set provided in ContentType is invalid."))
    {
        // Invalid encoding? Sometimes webpages have incorrectly set their charset / content type.
        // See if we can just parse the thing using UTF-8.
        logger.LogWarning(invalidOpError, "Unable to parse using HTTP client due to invalid ContentType. Attempting to parse using UTF-8.");
        return await TryFetchWithForcedUtf8(url, acceptHeaders);
    }
    catch (HttpForbiddenException forbiddenError)
    {
        // Supply the url argument so the {url} placeholder in the template is actually filled in.
        logger.LogWarning(forbiddenError, "Received 403 Forbidden when fetching {url}. Attempting fetch with CURL user agent fallback.", url);

        // TODO: should we always try this when we encounter exception, not just forbidden exception?
        return await TryFetchWithCurlUserAgent(url, acceptHeaders);
    }
    catch (Exception httpException)
    {
        logger.LogWarning(httpException, "Failed to fetch {url}. Falling back to HTTP/2 fetch.", url);
        return await TryFetchWithHttp2Client(url, acceptHeaders);
    }
}