// Downloads the document at _uri, runs it through SmartReader to obtain the
// article text, and builds an Incident from that text.
//
// Returns: an Incident whose first argument is the text with hashtags, URLs and
//          double spaces removed, and whose second argument is the text with only
//          double spaces removed. Returns null when the article yields no text
//          content/excerpt, or when any exception is raised while loading (the
//          exception is logged through _logger and not rethrown).
public async Task<Incident> LoadAsync()
{
    try
    {
        string data;

        // The handler, client and response all hold network resources, so they are
        // released deterministically once the body has been read into a string.
        // NOTE(review): a client is still created per call; consider sharing a single
        // HttpClient instance if this method is invoked frequently.
        using (var handler = new HttpClientHandler
        {
            AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip
        })
        using (var client = new HttpClient(handler))
        {
            // Setting the default user agent. The client was just created, so its
            // user-agent header collection is always empty at this point.
            client.DefaultRequestHeaders.UserAgent.ParseAdd("Azure Function");

            using (var httpResponseMessage = await client.GetAsync(_uri))
            {
                httpResponseMessage.EnsureSuccessStatusCode();

                // There is a bug in the .NET framework that causes ReadAsStringAsync() to fail
                // if the server reports the charset as "utf-8" (quoted) rather than utf-8:
                // https://github.com/dotnet/corefx/issues/5014
                if (httpResponseMessage.Content.Headers.ContentType?.CharSet == @"""utf-8""")
                {
                    httpResponseMessage.Content.Headers.ContentType.CharSet = "UTF-8";
                }

                data = await httpResponseMessage.Content.ReadAsStringAsync();
            }
        }

        data = StringSanitizer.SimplifyHtmlEncoded(data);

        var sr = new SmartReader.Reader(_uri, data);
        sr.AddCustomOperationStart(SpaceElements);
        var article = sr.GetArticle();

        // Prefer the full text content; fall back to the excerpt when it is empty.
        var content = !string.IsNullOrEmpty(article.TextContent)
            ? article.TextContent
            : article.Excerpt;

        if (!string.IsNullOrEmpty(content))
        {
            var shortSummary = StringSanitizer.RemoveDoublespaces(
                    StringSanitizer.RemoveUrls(
                        StringSanitizer.RemoveHashtags(content)))
                .Trim();
            var summary = StringSanitizer.RemoveDoublespaces(content).Trim();

            return new Incident(shortSummary, summary);
        }
    }
    catch (Exception e)
    {
        // Best effort: a failed load is reported to the caller as null.
        _logger.LogError(e, "Exception loading article");
    }

    return null;
}
// Clean up for the ShortSummary.
// The short summary is the shortened version of the summary that is optimized to be
// processed by LUIS.
// Applies, in order: HTML tag removal, HTML-encoded text simplification, filler word
// removal, special character removal, double-space removal, and a final Trim().
private string ShortSummaryCleanUp(string body)
{
    var withoutTags = StringSanitizer.RemoveHtmlTags(body);
    var simplified = StringSanitizer.SimplifyHtmlEncoded(withoutTags);
    var withoutFillers = StringSanitizer.RemoveFillerWords(simplified);
    var withoutSpecials = StringSanitizer.RemoveSpecialCharacters(withoutFillers);
    var singleSpaced = StringSanitizer.RemoveDoublespaces(withoutSpecials);

    return singleSpaced.Trim();
}
// Clean up for the summary.
// The summary is the human readable content that is displayed in the ESRI portal.
// Applies, in order: HTML tag removal, HTML decoding, hashtag removal, double-space
// removal, and a final Trim().
private string SummaryCleanUp(string body)
{
    var withoutTags = StringSanitizer.RemoveHtmlTags(body);
    var decoded = WebUtility.HtmlDecode(withoutTags);
    var withoutHashtags = StringSanitizer.RemoveHashtags(decoded);
    var singleSpaced = StringSanitizer.RemoveDoublespaces(withoutHashtags);

    return singleSpaced.Trim();
}
// Function entry point, triggered by messages on the "twitter" queue.
// Deserializes the queued JSON into a TweetModel and resolves a source URL for it.
//  - When a source URL is available, the referenced page is loaded and its incident
//    summaries are used; if that load yields nothing, both summaries stay null and a
//    warning is logged.
//  - When no source URL is available, the summaries are derived from the tweet text.
// Returns a LuisInput carrying the summaries together with the tweet's URLs and the
// user's location.
public static async Task<LuisInput> Run([QueueTrigger("twitter")] string json, ILogger log)
{
    log.LogInformation($"Scrape function invoked:\n{json}");

    var tweet = JsonConvert.DeserializeObject<TweetModel>(json);

    string summary = null;
    string shortSummary = null;

    var sourceUrl = await SourceUrlFromTweetAsync(tweet, log);

    if (string.IsNullOrEmpty(sourceUrl))
    {
        log.LogInformation("No source url was available for this input, skipping scrape.");

        // Fall back to the tweet's own text.
        var decodedTweet = System.Net.WebUtility.HtmlDecode(tweet.TweetText);
        var text = StringSanitizer.SimplifyPunctuation(decodedTweet);

        summary = StringSanitizer.RemoveDoublespaces(text);

        var withoutHashtags = StringSanitizer.RemoveHashtags(text);
        var withoutUrls = StringSanitizer.RemoveUrls(withoutHashtags);
        shortSummary = StringSanitizer.RemoveDoublespaces(withoutUrls);
    }
    else
    {
        log.LogInformation($"Loading external reference into scraper '{sourceUrl}'.");

        var incident = await GetIncidentFromUrl(log, sourceUrl);
        if (incident == null)
        {
            log.LogWarning($"Reference failed to load content from '{sourceUrl}'.");
        }
        else
        {
            shortSummary = incident.ShortSummary;
            summary = incident.Summary;
        }
    }

    var result = new LuisInput
    {
        SourceUrl = sourceUrl,
        TwitterProfileUrl = $"https://twitter.com/{tweet.TweetedBy}",
        TweetUrl = $"https://twitter.com/{tweet.TweetedBy}/status/{tweet.TweetId}",
        UserLocation = tweet.UserDetails?.Location,
        ShortSummary = shortSummary,
        Summary = summary
    };

    return result;
}