Esempio n. 1
0
        // Downloads the document at _uri, runs it through SmartReader and builds an Incident
        // from the extracted text.
        //
        // Returns: an Incident holding (shortSummary, summary) when the article's TextContent
        //          (or, failing that, its Excerpt) is non-empty; otherwise null.
        // Errors:  any exception raised while downloading or parsing is logged through _logger
        //          and reported to the caller as a null result (best-effort load).
        public async Task <Incident> LoadAsync()
        {
            try
            {
                string data;

                var handler = new HttpClientHandler
                {
                    AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip
                };

                // HttpClient(handler) takes ownership of the handler, so disposing the client
                // releases both; the response is disposed as soon as its body has been read.
                using (var client = new HttpClient(handler))
                {
                    // setting the default user agent
                    if (client.DefaultRequestHeaders.UserAgent.Count == 0)
                    {
                        client.DefaultRequestHeaders.UserAgent.ParseAdd("Azure Function");
                    }

                    using (var httpResponseMessage = await client.GetAsync(_uri))
                    {
                        httpResponseMessage.EnsureSuccessStatusCode();

                        // There is a bug in the .net framework that causes ReadAsStringAsync() to fail if the server reports the content encoding as "utf-8" rather than utf-8 https://github.com/dotnet/corefx/issues/5014
                        // Charset names are case-insensitive, so the quoted form is matched regardless of casing.
                        var contentType = httpResponseMessage.Content.Headers.ContentType;
                        if (string.Equals(contentType?.CharSet, @"""utf-8""", StringComparison.OrdinalIgnoreCase))
                        {
                            contentType.CharSet = "UTF-8";
                        }

                        data = await httpResponseMessage.Content.ReadAsStringAsync();
                    }
                }

                data = StringSanitizer.SimplifyHtmlEncoded(data);

                var sr = new SmartReader.Reader(_uri, data);
                sr.AddCustomOperationStart(SpaceElements);
                var article = sr.GetArticle();

                // Prefer the full text; fall back to the excerpt when no text content was extracted.
                var content = !string.IsNullOrEmpty(article.TextContent) ? article.TextContent : article.Excerpt;

                if (!string.IsNullOrEmpty(content))
                {
                    // The short summary additionally has hashtags and URLs removed.
                    var shortSummary =
                        StringSanitizer.RemoveDoublespaces(
                            StringSanitizer.RemoveUrls(
                                StringSanitizer.RemoveHashtags(content)))
                        .Trim();

                    var summary =
                        StringSanitizer.RemoveDoublespaces(content)
                        .Trim();
                    return(new Incident(shortSummary, summary));
                }
            }
            catch (Exception e)
            {
                _logger.LogError(e, "Exception loading article");
            }

            return(null);
        }
        // Clean up for the ShortSummary
        // The short summary is the shortened version of the summary that is optimized to be processed by LUIS
        private string ShortSummaryCleanUp(string body)
        {
            // Pipeline order matters: RemoveHtmlTags -> SimplifyHtmlEncoded -> RemoveFillerWords
            // -> RemoveSpecialCharacters -> RemoveDoublespaces, then trim the ends.
            var withoutTags = StringSanitizer.RemoveHtmlTags(body);
            var simplified = StringSanitizer.SimplifyHtmlEncoded(withoutTags);

            var condensed = StringSanitizer.RemoveSpecialCharacters(
                StringSanitizer.RemoveFillerWords(simplified));

            return StringSanitizer.RemoveDoublespaces(condensed).Trim();
        }
        // Clean up for the summary
        // The summary is the human readable content that is displayed in the ESRI portal
        private string SummaryCleanUp(string body)
        {
            // RemoveHtmlTags runs first, then the remaining HTML entities are decoded.
            var decoded = WebUtility.HtmlDecode(StringSanitizer.RemoveHtmlTags(body));

            // Hashtags are removed before double spaces are collapsed; the ends are trimmed last.
            var withoutHashtags = StringSanitizer.RemoveHashtags(decoded);

            return StringSanitizer.RemoveDoublespaces(withoutHashtags).Trim();
        }
        // Queue-triggered entry point: turns a serialized tweet into a LuisInput.
        //
        // json: the queue message, expected to deserialize into a TweetModel.
        // log:  logger used for progress and diagnostics.
        //
        // When SourceUrlFromTweetAsync yields a URL, the summaries come from the incident loaded
        // from that URL (both stay null if loading fails, and a warning is logged). When no URL
        // is available, the summaries are derived from the tweet text itself.
        //
        // Throws ArgumentException when the message does not deserialize to a tweet.
        public static async Task <LuisInput> Run([QueueTrigger("twitter")] string json, ILogger log)
        {
            log.LogInformation($"Scrape function invoked:\n{json}");

            var tweet = JsonConvert.DeserializeObject <TweetModel>(json);

            // DeserializeObject returns null for an empty or literal "null" payload; fail with a
            // clear message instead of a NullReferenceException further down.
            if (tweet == null)
            {
                log.LogError("Queue message did not deserialize to a tweet.");
                throw new ArgumentException("Queue message did not deserialize to a tweet.", nameof(json));
            }

            string summary = null, shortSummary = null;
            var    sourceUrl = await SourceUrlFromTweetAsync(tweet, log);

            if (!string.IsNullOrEmpty(sourceUrl))
            {
                log.LogInformation($"Loading external reference into scraper '{sourceUrl}'.");

                var incident = await GetIncidentFromUrl(log, sourceUrl);

                if (incident != null)
                {
                    shortSummary = incident.ShortSummary;
                    summary      = incident.Summary;
                }
                else
                {
                    log.LogWarning($"Reference failed to load content from '{sourceUrl}'.");
                }
            }
            else
            {
                log.LogInformation("No source url was available for this input, skipping scrape.");
                string text = StringSanitizer.SimplifyPunctuation(
                    System.Net.WebUtility.HtmlDecode(tweet.TweetText));

                summary      = StringSanitizer.RemoveDoublespaces(text);

                // The short summary additionally has hashtags and URLs removed.
                shortSummary =
                    StringSanitizer.RemoveDoublespaces(
                        StringSanitizer.RemoveUrls(
                            StringSanitizer.RemoveHashtags(text)));
            }

            return(new LuisInput
            {
                SourceUrl = sourceUrl,
                TwitterProfileUrl = $"https://twitter.com/{tweet.TweetedBy}",
                TweetUrl = $"https://twitter.com/{tweet.TweetedBy}/status/{tweet.TweetId}",
                UserLocation = tweet.UserDetails?.Location,
                ShortSummary = shortSummary,
                Summary = summary
            });
        }