Exemple #1
0
        /// <summary>
        /// Produces the body text of an article: runs the header, link, unordered-list and
        /// script removal helpers over the document, converts the remaining HTML to text via
        /// <c>_spider.HtmlToTextAsync</c>, replaces newlines with spaces and finally passes
        /// the text through <c>RemoveNonBodyTextSentences</c>.
        /// </summary>
        /// <param name="articleHtmlDocument">
        /// The article document to process. Presumably modified in place by the Remove*
        /// helpers (their results are not captured here) - confirm against their definitions.
        /// </param>
        /// <returns>
        /// The filtered article text, or <see cref="String.Empty"/> when the document, its
        /// root node or its outer HTML is <c>null</c>.
        /// </returns>
        /// <exception cref="AggregateException">
        /// The HTML-to-text conversion task faulted or was cancelled. The message contains the
        /// HTML that could not be converted; the original failures are in
        /// <see cref="AggregateException.InnerExceptions"/>.
        /// </exception>
        public string ExtractBodyTextFromArticleDocument(HtmlDocument articleHtmlDocument)
        {
            // Guard BEFORE handing the document to the removal helpers, so a null document
            // (or one without a root node) never reaches them.
            if (articleHtmlDocument?.DocumentNode == null)
            {
                return String.Empty;
            }

            RemoveHeadersFromDocument(articleHtmlDocument);
            RemoveLinksFromDocument(articleHtmlDocument);
            RemoveUnorderedListsFromDocument(articleHtmlDocument);
            RemoveScriptsFromDocument(articleHtmlDocument);

            var cleanedHtml = articleHtmlDocument.DocumentNode.OuterHtml;
            if (cleanedHtml == null)
            {
                return String.Empty;
            }

            var htmlToTextConversion = _spider.HtmlToTextAsync(cleanedHtml);

            try
            {
                // This method's signature is synchronous, so the conversion has to be waited on.
                // Wait() throws AggregateException when the task faults or is cancelled, which
                // means a status check after a successful wait could never fail.
                htmlToTextConversion.Wait();
            }
            catch (AggregateException conversionFailure)
            {
                // Keep the exception type the wait already surfaced to callers, but attach the
                // offending HTML and preserve the underlying causes.
                throw new AggregateException(
                    $"could not convert the following html to text {cleanedHtml}",
                    conversionFailure.InnerExceptions);
            }

            var articleText = htmlToTextConversion.Result.Replace("\n", " ");
            return RemoveNonBodyTextSentences(articleText);
        }