/// <summary>
/// Produces the plain body text of an article from its HTML document.
/// </summary>
/// <remarks>
/// The document is passed through the <c>RemoveHeadersFromDocument</c>,
/// <c>RemoveLinksFromDocument</c>, <c>RemoveUnorderedListsFromDocument</c> and
/// <c>RemoveScriptsFromDocument</c> helpers (defined elsewhere; presumably they strip
/// those elements from the document in place - confirm against their definitions).
/// The remaining outer HTML is converted to text through <c>_spider.HtmlToTextAsync</c>,
/// newlines are replaced by single spaces, and the result is filtered by
/// <c>RemoveNonBodyTextSentences</c>.
/// This method blocks the calling thread until the asynchronous conversion completes.
/// </remarks>
/// <param name="articleHtmlDocument">The article document to extract text from; may be null.</param>
/// <returns>
/// The extracted body text, or <see cref="String.Empty"/> when the document is null or
/// has no outer HTML after cleanup.
/// </returns>
/// <exception cref="AggregateException">
/// The HTML-to-text conversion task faulted or was cancelled. The message names the HTML
/// that failed to convert and the original failures are available as inner exceptions.
/// </exception>
public string ExtractBodyTextFromArticleDocument(HtmlDocument articleHtmlDocument)
{
    // Guard before the cleanup helpers run: the original only null-checked the
    // document after it had already been handed to all four of them.
    if (articleHtmlDocument == null)
    {
        return String.Empty;
    }

    RemoveHeadersFromDocument(articleHtmlDocument);
    RemoveLinksFromDocument(articleHtmlDocument);
    RemoveUnorderedListsFromDocument(articleHtmlDocument);
    RemoveScriptsFromDocument(articleHtmlDocument);

    var cleanedHtml = articleHtmlDocument.DocumentNode?.OuterHtml;
    if (cleanedHtml == null)
    {
        return String.Empty;
    }

    var htmlToTextConversion = _spider.HtmlToTextAsync(cleanedHtml);
    try
    {
        // Task.WaitAll throws AggregateException when the task faults or is cancelled,
        // so a status check after it can never observe a failed task.
        Task.WaitAll(htmlToTextConversion);
    }
    catch (AggregateException conversionFailure)
    {
        // Keep the exception type callers already see, but attach the context message
        // that the previous (unreachable) failure branch was meant to provide.
        throw new AggregateException(
            $"could not convert the following html to text {cleanedHtml}",
            conversionFailure.InnerExceptions);
    }

    // The task has run to completion here, so reading Result does not block or throw.
    var articleText = htmlToTextConversion.Result.Replace("\n", " ");
    return RemoveNonBodyTextSentences(articleText);
}