/// <summary>
/// Determines the article title and wraps it in a new <c>h1</c> element.
/// </summary>
/// <remarks>
/// Starts from the document title and then, depending on its shape:
/// applies the dash regexes when <c>_ArticleTitleDashRegex1</c> matches,
/// applies the colon regexes when the title contains ": ",
/// or, when the title length is outside the configured min/max bounds,
/// uses the text of the document's only <c>h1</c> element (if there is exactly one).
/// A result with too few words is replaced by the full document title.
/// </remarks>
/// <param name="document">Document to read the title and headers from.</param>
/// <returns>
/// A new <c>h1</c> element containing the title, or <c>null</c> when no non-empty title was found.
/// </returns>
internal XElement ExtractArticleTitle(XDocument document)
{
    var documentBody = GetOrCreateBody(document);
    string documentTitle = document.GetTitle() ?? "";
    string currentTitle = documentTitle;

    if (_ArticleTitleDashRegex1.IsMatch(currentTitle))
    {
        currentTitle = _ArticleTitleDashRegex2.Replace(documentTitle, "$1");

        if (currentTitle.Split(' ').Length < _MinArticleTitleWordsCount1)
        {
            currentTitle = _ArticleTitleDashRegex3.Replace(documentTitle, "$1");
        }
    }
    else if (currentTitle.IndexOf(": ") != -1)
    {
        currentTitle = _ArticleTitleColonRegex1.Replace(documentTitle, "$1");

        if (currentTitle.Split(' ').Length < _MinArticleTitleWordsCount1)
        {
            currentTitle = _ArticleTitleColonRegex2.Replace(documentTitle, "$1");
        }
    }
    else if (currentTitle.Length > _MaxArticleTitleLength || currentTitle.Length < _MinArticleTitleLength)
    {
        // Materialize once so the query is not executed separately for the count and the element access.
        var levelOneHeaders = documentBody.GetElementsByTagName("h1").ToList();

        if (levelOneHeaders.Count == 1)
        {
            currentTitle = GetInnerText(levelOneHeaders[0]);
        }
    }

    currentTitle = (currentTitle ?? "").Trim();

    // Fall back to the full document title only when there is one. Otherwise a short
    // title taken from the h1 header would be replaced by an empty string and lost.
    if (!string.IsNullOrEmpty(documentTitle)
        && currentTitle.Split(' ').Length <= _MinArticleTitleWordsCount2)
    {
        currentTitle = documentTitle;
    }

    if (string.IsNullOrEmpty(currentTitle))
    {
        return null;
    }

    var articleTitleElement = new XElement("h1");

    articleTitleElement.SetInnerHtml(currentTitle);

    return articleTitleElement;
}
/// <summary>
/// Post-processes the extracted article content in place: cleans styles and breaks,
/// removes junk elements, drops empty paragraphs and strips breaks that directly
/// precede paragraphs.
/// </summary>
/// <param name="articleContentElement">Article content element to clean up.</param>
internal void PrepareArticleContentElement(XElement articleContentElement)
{
    CleanStyles(articleContentElement);
    KillBreaks(articleContentElement);

    /* Clean out junk from the article content. */
    Clean(articleContentElement, "form");
    Clean(articleContentElement, "object");
    Clean(articleContentElement, "h1");

    /* A lone h2 is probably being used as the header rather than a subheader;
     * we already have a header, so drop it. */
    bool hasSingleLevelTwoHeader = articleContentElement.GetElementsByTagName("h2").Count() == 1;

    if (hasSingleLevelTwoHeader)
    {
        Clean(articleContentElement, "h2");
    }

    Clean(articleContentElement, "iframe");
    CleanHeaders(articleContentElement);

    /* These run last because the cleaning above may have removed junk that affects them. */
    CleanConditionally(articleContentElement, "table");
    CleanConditionally(articleContentElement, "ul");
    CleanConditionally(articleContentElement, "div");

    /* Remove extra paragraphs: those with empty inner text and no images, embeds or objects. */
    List<XElement> emptyParagraphs =
        articleContentElement
            .GetElementsByTagName("p")
            .Where(paragraph =>
                GetInnerText(paragraph, false).Length == 0
                && !paragraph.GetElementsByTagName("img").Any()
                && !paragraph.GetElementsByTagName("embed").Any()
                && !paragraph.GetElementsByTagName("object").Any())
            .ToList();

    RemoveElements(emptyParagraphs);

    /* Remove br's that are directly before paragraphs. */
    string contentHtml = articleContentElement.GetInnerHtml();
    string contentHtmlWithoutLeadingBreaks = _BreakBeforeParagraphRegex.Replace(contentHtml, "<p");

    articleContentElement.SetInnerHtml(contentHtmlWithoutLeadingBreaks);
}
/// <summary>
/// Walks the document and (1) removes elements whose class and id look like unlikely
/// content candidates, and (2) turns <c>div</c> elements into paragraphs, either by renaming
/// the div or by wrapping its non-empty text nodes in <c>p</c> elements.
/// Does nothing when <c>_dontStripUnlikelys</c> is set.
/// </summary>
/// <param name="document">Document to process in place.</param>
internal void StripUnlikelyCandidates(XDocument document)
{
    if (_dontStripUnlikelys)
    {
        return;
    }

    var rootElement = document.Root;

    new ElementsTraverser(
        element =>
        {
            string tagName = "";

            if (element.Name != null && element.Name.LocalName != null)
            {
                tagName = element.Name.LocalName;
            }

            /* Remove unlikely candidates; body and anchor elements are never removed here. */
            string classAndId = element.GetClass() + element.GetId();
            bool isExemptTag =
                "body".Equals(tagName, StringComparison.OrdinalIgnoreCase)
                || "a".Equals(tagName, StringComparison.OrdinalIgnoreCase);

            if (classAndId.Length > 0
                && !isExemptTag
                && _UnlikelyCandidatesRegex.IsMatch(classAndId)
                && !_OkMaybeItsACandidateRegex.IsMatch(classAndId))
            {
                if (element.Parent != null)
                {
                    element.Remove();
                }

                // This element has been dealt with - move on to the next one.
                return;
            }

            /* Only divs are processed beyond this point. */
            if (!"div".Equals(tagName, StringComparison.OrdinalIgnoreCase))
            {
                return;
            }

            if (!_DivToPElementsRegex.IsMatch(element.GetInnerHtml()))
            {
                // The div-to-p regex found nothing in the inner HTML - the div itself becomes a p.
                element.Name = "p";
                return;
            }

            // Otherwise replace the div's text nodes with p's (experimental).
            new ChildNodesTraverser(
                childNode =>
                {
                    if (childNode.NodeType == XmlNodeType.Text && GetInnerText(childNode).Length != 0)
                    {
                        var wrapperParagraph = new XElement("p");

                        // The raw text value is used instead of GetInnerText() to preserve whitespace.
                        wrapperParagraph.SetInnerHtml(((XText)childNode).Value);
                        wrapperParagraph.SetClass(ReadabilityStyledCssClass);
                        wrapperParagraph.SetStyle("display: inline;");

                        childNode.ReplaceWith(wrapperParagraph);
                    }
                }
            ).Traverse(element);
        }).Traverse(rootElement);
}
/// <summary>
/// Rebuilds the document body around the extracted article: embeds the readability
/// stylesheet into the head, replaces the body content with an overlay div containing
/// an inner div with the title and content, and applies the reading style to the body.
/// </summary>
/// <param name="document">Document to rebuild in place.</param>
/// <param name="articleTitleElement">Title element to insert; skipped when <c>null</c>.</param>
/// <param name="articleContentElement">Content element to insert; skipped when <c>null</c>.</param>
/// <exception cref="InternalErrorException">
/// Thrown when the embedded stylesheet resource cannot be loaded.
/// </exception>
internal void GlueDocument(XDocument document, XElement articleTitleElement, XElement articleContentElement)
{
    var documentBody = GetOrCreateBody(document);

    /* Include readability.css stylesheet. */
    var headElement = document.GetElementsByTagName("head").FirstOrDefault();

    if (headElement == null)
    {
        headElement = new XElement("head");
        documentBody.AddBeforeSelf(headElement);
    }

    var styleElement = new XElement("style");

    styleElement.SetAttributeValue("type", "text/css");

    var readabilityStylesheetStream = Assembly.GetExecutingAssembly().GetManifestResourceStream(_ReadabilityStylesheetResourceName);

    if (readabilityStylesheetStream == null)
    {
        throw new InternalErrorException("Couldn't load the NReadability stylesheet embedded resource.");
    }

    using (var sr = new StreamReader(readabilityStylesheetStream))
    {
        styleElement.SetInnerHtml(sr.ReadToEnd());
    }

    headElement.Add(styleElement);

    string readingStyleClass = GetReadingStyleClass(_readingStyle);

    /* Create inner div. */
    var innerDiv = new XElement("div");

    innerDiv.SetId(InnerDivId);
    innerDiv.SetClass(GetReadingMarginClass(_readingMargin) + " " + GetReadingSizeClass(_readingSize));

    if (articleTitleElement != null)
    {
        innerDiv.Add(articleTitleElement);
    }

    if (articleContentElement != null)
    {
        innerDiv.Add(articleContentElement);
    }

    /* Create overlay div. */
    var overlayDiv = new XElement("div");

    overlayDiv.SetId(OverlayDivId);
    overlayDiv.SetClass(readingStyleClass);
    overlayDiv.Add(innerDiv);

    /* Clear the old HTML. XElement.RemoveAll() removes attributes as well as child nodes,
     * so the reading style must be applied to the body only after this call. */
    documentBody.RemoveAll();

    /* Apply reading style to body. */
    documentBody.SetClass(readingStyleClass);
    documentBody.SetStyle("display: block;");

    /* Insert the new content. */
    documentBody.Add(overlayDiv);
}
/// <summary>
/// Removes extraneous break tags from a <paramref name="element" /> by replacing every
/// match of the break-killing regex in its inner HTML with a single break tag.
/// </summary>
/// <param name="element">Element whose inner HTML is rewritten in place.</param>
internal void KillBreaks(XElement element)
{
    string originalHtml = element.GetInnerHtml();
    string htmlWithCollapsedBreaks = _KillBreaksRegex.Replace(originalHtml, "<br />");

    element.SetInnerHtml(htmlWithCollapsedBreaks);
}
/// <summary>
/// Recursively appends subsequent pages of a multipage article.
/// </summary>
/// <remarks>
/// Each call fetches and transcodes <paramref name="url" />, strips its <c>h1</c> header and appends
/// its content to the "readInner" element of <paramref name="document" /> as a new "page" div.
/// Once the page limit is exceeded, a "View Next Page" link is appended instead.
/// A page whose first paragraph text already occurs in the compiled content is treated as a
/// duplicate and skipped. If either document lacks a "readInner" element, nothing is appended.
/// </remarks>
/// <param name="document">Compiled document</param>
/// <param name="url">Url of current page</param>
private void AppendNextPage(XDocument document, string url)
{
    _curPageNum++;

    var contentDiv = document.GetElementById("readInner");

    if (contentDiv == null)
    {
        // There is no container to append further pages to.
        return;
    }

    if (_curPageNum > _MaxPages)
    {
        // Build the link from elements rather than parsing concatenated markup:
        // the URL is escaped properly (query strings commonly contain '&'), and
        // an XElement - unlike an XDocument - can be added to another element.
        var nextPageLink2 = new XElement("a", new XAttribute("href", url ?? ""), "View Next Page");
        var nextPageLinkDiv = new XElement("div", new XAttribute("style", "text-align: center"), nextPageLink2);

        contentDiv.Add(nextPageLinkDiv);

        return;
    }

    string nextContent = _urlFetcher.Fetch(url);

    if (string.IsNullOrEmpty(nextContent))
    {
        return;
    }

    bool mainContentExtracted;
    string extractedTitle;
    string nextPageLink;

    var nextDocument = _transcoder.TranscodeToXml(nextContent, url, out mainContentExtracted, out extractedTitle, out nextPageLink);
    var nextInner = nextDocument.GetElementById("readInner");

    if (nextInner == null)
    {
        // The transcoded page has no content container; remember the URL so it is not revisited.
        _parsedPages.Add(url);
        return;
    }

    var header = nextInner.Element("h1");

    if (header != null)
    {
        header.Remove();
    }

    /*
     * Anti-duplicate mechanism. Essentially, get the first paragraph of our new page.
     * Compare it against all of the previous documents we've gotten. If the previous
     * document contains exactly the text of this first paragraph, it's probably a duplicate.
     */
    var firstP = nextInner.GetElementsByTagName("p").FirstOrDefault();

    if (firstP != null && firstP.GetInnerHtml().Length > 100)
    {
        // TODO: This test could probably be improved to compare the actual markup.
        string existingContent = contentDiv.Value;
        string firstParagraphText = firstP.Value;

        // string.Contains performs an ordinal (culture-independent) search, which is what an exact match needs.
        if (!string.IsNullOrEmpty(existingContent)
            && !string.IsNullOrEmpty(firstParagraphText)
            && existingContent.Contains(firstParagraphText))
        {
            _parsedPages.Add(url);
            return;
        }
    }

    /* Add the content to the existing html */
    var nextDiv = new XElement("div");

    if (_pageSeparatorBuilder != null)
    {
        nextDiv.SetInnerHtml(_pageSeparatorBuilder(_curPageNum));
    }

    nextDiv.SetId(_PageIdPrefix + _curPageNum);
    nextDiv.SetClass("page");
    nextDiv.Add(nextInner.Nodes());

    contentDiv.Add(nextDiv);

    _parsedPages.Add(url);

    /* Only continue if we haven't already seen the next page */
    if (!string.IsNullOrEmpty(nextPageLink) && !_parsedPages.Contains(nextPageLink))
    {
        AppendNextPage(document, nextPageLink);
    }
}
/// <summary>
/// Works out the article title and returns it wrapped in a new <c>h1</c> element.
/// </summary>
/// <remarks>
/// The document title is the starting point. It is reduced with the dash regexes when
/// <c>_ArticleTitleDashRegex1</c> matches, or with the colon regexes when it contains ": ".
/// When neither applies and the title length is outside the configured bounds, the text of the
/// single <c>h1</c> header is used, or of the single <c>h2</c> header when there are no <c>h1</c> headers.
/// A result with too few words is replaced by the document title, provided that title is not empty.
/// </remarks>
/// <param name="document">Document to read the title and headers from.</param>
/// <returns>
/// A new <c>h1</c> element containing the title, or <c>null</c> when the resulting title is empty.
/// </returns>
internal XElement ExtractArticleTitle(XDocument document)
{
    XElement body = GetOrCreateBody(document);
    string rawTitle = document.GetTitle() ?? "";
    string candidate = rawTitle;

    if (_ArticleTitleDashRegex1.IsMatch(candidate))
    {
        candidate = _ArticleTitleDashRegex2.Replace(rawTitle, "$1");

        bool hasTooFewWords = candidate.Split(' ').Length < _MinArticleTitleWordsCount1;

        if (hasTooFewWords)
        {
            candidate = _ArticleTitleDashRegex3.Replace(rawTitle, "$1");
        }
    }
    else if (candidate.IndexOf(": ") >= 0)
    {
        candidate = _ArticleTitleColonRegex1.Replace(rawTitle, "$1");

        bool hasTooFewWords = candidate.Split(' ').Length < _MinArticleTitleWordsCount1;

        if (hasTooFewWords)
        {
            candidate = _ArticleTitleColonRegex2.Replace(rawTitle, "$1");
        }
    }
    else if (candidate.Length < _MinArticleTitleLength || candidate.Length > _MaxArticleTitleLength)
    {
        List<XElement> headers = body.GetElementsByTagName("h1").ToList();

        if (headers.Count == 0)
        {
            // No level one headers at all - give level two headers a chance.
            headers = body.GetElementsByTagName("h2").ToList();
        }

        if (headers.Count == 1)
        {
            candidate = GetInnerText(headers[0]);
        }
    }

    candidate = (candidate ?? "").Trim();

    bool hasDocumentTitle = !string.IsNullOrEmpty(rawTitle);

    if (hasDocumentTitle && candidate.Split(' ').Length <= _MinArticleTitleWordsCount2)
    {
        candidate = rawTitle;
    }

    if (string.IsNullOrEmpty(candidate))
    {
        return null;
    }

    var titleElement = new XElement("h1");

    titleElement.SetInnerHtml(candidate);

    return titleElement;
}
/// <summary>
/// Multiline plain text set through SetInnerHtml must be returned unchanged by GetInnerHtml.
/// </summary>
public void Test_SetInnerHtml_text_multiline()
{
    const string expectedHtml = "\r\ntext1\r\ntext\r\n";
    var divElement = new XElement("div");

    divElement.SetInnerHtml(expectedHtml);
    string actualHtml = divElement.GetInnerHtml();

    Assert.AreEqual(expectedHtml, actualHtml);
}
/// <summary>
/// Plain text set through SetInnerHtml must be returned unchanged by GetInnerHtml.
/// </summary>
public void Test_SetInnerHtml_text()
{
    const string expectedHtml = "text";
    var divElement = new XElement("div");

    divElement.SetInnerHtml(expectedHtml);
    string actualHtml = divElement.GetInnerHtml();

    Assert.AreEqual(expectedHtml, actualHtml);
}
/// <summary>
/// After setting inner HTML consisting of the raquo character, GetInnerHtml must still contain it.
/// </summary>
public void Test_SetInnerHtml_html_with_entity_raquo()
{
    const string raquoHtml = "»";
    var divElement = new XElement("div");

    divElement.SetInnerHtml(raquoHtml);
    string actualHtml = divElement.GetInnerHtml();
    bool containsRaquo = actualHtml.Contains("»");

    Assert.IsTrue(containsRaquo);
}
/// <summary>
/// Rebuilds the document body around the extracted article: embeds the readability
/// stylesheet into the head, replaces the body content with the title and content
/// elements, and applies the reading style to the body.
/// </summary>
/// <param name="document">Document to rebuild in place.</param>
/// <param name="articleTitleElement">Title element to insert; skipped when <c>null</c>.</param>
/// <param name="articleContentElement">Content element to insert; skipped when <c>null</c>.</param>
/// <exception cref="InternalErrorException">
/// Thrown when the embedded stylesheet resource cannot be loaded.
/// </exception>
internal void GlueDocument(XDocument document, XElement articleTitleElement, XElement articleContentElement)
{
    XElement documentBody = GetOrCreateBody(document);

    /* Include readability.css stylesheet. */
    XElement headElement = document.GetElementsByTagName("head").FirstOrDefault();

    if (headElement == null)
    {
        headElement = new XElement("head");
        documentBody.AddBeforeSelf(headElement);
    }

    XElement styleElement = new XElement("style");

    styleElement.SetAttributeValue("type", "text/css");

    Stream readabilityStylesheetStream = typeof(NReadabilityTranscoder).GetTypeInfo().Assembly.GetManifestResourceStream(_ReadabilityStylesheetResourceName);

    if (readabilityStylesheetStream == null)
    {
        throw new InternalErrorException("Couldn't load the NReadability stylesheet embedded resource.");
    }

    using (var sr = new StreamReader(readabilityStylesheetStream))
    {
        styleElement.SetInnerHtml(sr.ReadToEnd());
    }

    headElement.Add(styleElement);

    /* Clear the old HTML. XElement.RemoveAll() removes attributes as well as child nodes,
     * so the reading style must be applied to the body only after this call. */
    documentBody.RemoveAll();

    /* Apply reading style to body. */
    string readingStyleClass = GetReadingStyleClass(_readingStyle);

    documentBody.SetClass(readingStyleClass);
    documentBody.SetStyle("display: block;");

    /* Insert the new content. */
    if (articleTitleElement != null)
    {
        documentBody.Add(articleTitleElement);
    }

    if (articleContentElement != null)
    {
        documentBody.Add(articleContentElement);
    }
}
using System;