/// <summary>
/// Downloads the document at <c>_uri</c>, extracts its readable article text with SmartReader
/// and wraps the cleaned-up text in an <see cref="Incident"/>.
/// </summary>
/// <returns>
/// An <see cref="Incident"/> built from the short summary and the summary, or <c>null</c> when
/// no article text could be extracted or when any exception occurred (the exception is logged).
/// </returns>
public async Task<Incident> LoadAsync()
{
    try
    {
        // The handler, client and response are all IDisposable; the using statements release
        // them on every exit path, including exceptions.
        using (var handler = new HttpClientHandler
        {
            AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip
        })
        using (var client = new HttpClient(handler))
        {
            // Set the default user agent. A freshly created client has no user agent yet,
            // so no emptiness check is needed.
            client.DefaultRequestHeaders.UserAgent.ParseAdd("Azure Function");

            using (var httpResponseMessage = await client.GetAsync(_uri))
            {
                httpResponseMessage.EnsureSuccessStatusCode();

                // There is a bug in the .NET framework that causes ReadAsStringAsync() to fail
                // if the server reports the charset as "utf-8" (with the quotes) rather than
                // utf-8: https://github.com/dotnet/corefx/issues/5014
                if (httpResponseMessage.Content.Headers.ContentType?.CharSet == @"""utf-8""")
                {
                    httpResponseMessage.Content.Headers.ContentType.CharSet = "UTF-8";
                }

                var data = await httpResponseMessage.Content.ReadAsStringAsync();
                data = StringSanitizer.SimplifyHtmlEncoded(data);

                var sr = new SmartReader.Reader(_uri, data);
                // NOTE(review): SpaceElements is defined elsewhere; it is registered as a custom
                // operation that runs at the start of SmartReader's processing.
                sr.AddCustomOperationStart(SpaceElements);
                var article = sr.GetArticle();

                // Prefer the full text content; fall back to the excerpt when it is empty.
                var content = !string.IsNullOrEmpty(article.TextContent)
                    ? article.TextContent
                    : article.Excerpt;

                if (!string.IsNullOrEmpty(content))
                {
                    // The short summary additionally has hashtags and URLs removed.
                    var shortSummary = StringSanitizer.RemoveDoublespaces(
                            StringSanitizer.RemoveUrls(
                                StringSanitizer.RemoveHashtags(content)))
                        .Trim();

                    var summary = StringSanitizer.RemoveDoublespaces(content).Trim();

                    return new Incident(shortSummary, summary);
                }
            }
        }
    }
    catch (Exception e)
    {
        // Best effort: any failure (network, HTTP status, parsing) is logged and reported
        // to the caller as a null result.
        _logger.LogError(e, "Exception loading article");
    }

    return null;
}
/// <summary>
/// Cleans up text for use as the short summary, i.e. the shortened version of the summary
/// that is optimized to be processed by LUIS.
/// </summary>
/// <param name="body">The raw text to clean up.</param>
/// <returns>
/// The text after applying, in order, <c>StringSanitizer.RemoveHtmlTags</c>,
/// <c>SimplifyHtmlEncoded</c>, <c>RemoveFillerWords</c>, <c>RemoveSpecialCharacters</c> and
/// <c>RemoveDoublespaces</c>, with leading and trailing whitespace trimmed.
/// </returns>
private string ShortSummaryCleanUp(string body)
{
    // Strip markup first, then simplify what remains of the HTML encoding.
    var decoded = StringSanitizer.SimplifyHtmlEncoded(
        StringSanitizer.RemoveHtmlTags(body));

    // Drop filler words and special characters from the decoded text.
    var condensed = StringSanitizer.RemoveSpecialCharacters(
        StringSanitizer.RemoveFillerWords(decoded));

    // Collapse the double spaces left behind by the removals above.
    return StringSanitizer.RemoveDoublespaces(condensed).Trim();
}