/// <summary>
        /// Derives the article title from the document and wraps it in a new <c>h1</c> element.
        /// </summary>
        /// <remarks>
        /// The document title (as returned by <c>GetTitle()</c>) is the starting point. Depending on
        /// which of the dash / colon patterns it matches, a part of it is extracted with the
        /// corresponding regular expressions; when its length is outside the configured range, the
        /// text of the document's only <c>h1</c> element (if there is exactly one) is used instead.
        /// A candidate with too few words is replaced with the full document title.
        /// </remarks>
        /// <param name="document">Document to extract the title from.</param>
        /// <returns>
        /// A new <c>h1</c> element holding the title, or <c>null</c> when no non-empty title was found.
        /// </returns>
        internal XElement ExtractArticleTitle(XDocument document)
        {
            var documentBody = GetOrCreateBody(document);
            string documentTitle = document.GetTitle() ?? "";
            string currentTitle = documentTitle;

            if (_ArticleTitleDashRegex1.IsMatch(currentTitle))
            {
                currentTitle = _ArticleTitleDashRegex2.Replace(documentTitle, "$1");

                // The first extraction left too few words - try the alternative pattern.
                if (currentTitle.Split(' ').Length < _MinArticleTitleWordsCount1)
                {
                    currentTitle = _ArticleTitleDashRegex3.Replace(documentTitle, "$1");
                }
            }
            else if (currentTitle.IndexOf(": ") != -1)
            {
                currentTitle = _ArticleTitleColonRegex1.Replace(documentTitle, "$1");

                // The first extraction left too few words - try the alternative pattern.
                if (currentTitle.Split(' ').Length < _MinArticleTitleWordsCount1)
                {
                    currentTitle = _ArticleTitleColonRegex2.Replace(documentTitle, "$1");
                }
            }
            else if (currentTitle.Length > _MaxArticleTitleLength || currentTitle.Length < _MinArticleTitleLength)
            {
                // Materialize once so the headers are not enumerated twice.
                var levelOneHeaders = documentBody.GetElementsByTagName("h1").ToList();

                if (levelOneHeaders.Count == 1)
                {
                    currentTitle = GetInnerText(levelOneHeaders[0]);
                }
            }

            currentTitle = (currentTitle ?? "").Trim();

            // Fall back to the full document title only when there is one. Without this guard an
            // empty document title would overwrite a short title taken from the single h1 and the
            // method would return null even though a usable title had been found.
            if (!string.IsNullOrEmpty(documentTitle)
             && currentTitle.Split(' ').Length <= _MinArticleTitleWordsCount2)
            {
                currentTitle = documentTitle;
            }

            if (string.IsNullOrEmpty(currentTitle))
            {
                return null;
            }

            var articleTitleElement = new XElement("h1");

            articleTitleElement.SetInnerHtml(currentTitle);

            return articleTitleElement;
        }
        /// <summary>
        /// Cleans up the extracted article content in place so that it is ready for display.
        /// </summary>
        /// <remarks>
        /// Steps, in order: clean styles, collapse breaks, drop junk elements (forms, objects,
        /// level-one headers, a lone <c>h2</c>, iframes), clean headers, conditionally clean
        /// tables / lists / divs, remove empty paragraphs and finally strip breaks that directly
        /// precede a paragraph.
        /// </remarks>
        /// <param name="articleContentElement">Root element of the article content; modified in place.</param>
        internal void PrepareArticleContentElement(XElement articleContentElement)
        {
            CleanStyles(articleContentElement);
            KillBreaks(articleContentElement);

            // Clean out junk from the article content.
            foreach (string junkTagName in new[] { "form", "object", "h1" })
            {
                Clean(articleContentElement, junkTagName);
            }

            // A single h2 is probably being used as the header rather than a subheader;
            // remove it since we already have a header.
            bool hasSingleLevelTwoHeader = articleContentElement.GetElementsByTagName("h2").Count() == 1;

            if (hasSingleLevelTwoHeader)
            {
                Clean(articleContentElement, "h2");
            }

            Clean(articleContentElement, "iframe");
            CleanHeaders(articleContentElement);

            // Do these last as the previous steps may have removed junk that affects them.
            foreach (string conditionalTagName in new[] { "table", "ul", "div" })
            {
                CleanConditionally(articleContentElement, conditionalTagName);
            }

            // Collect paragraphs that have no text and contain no images, embeds or objects.
            var emptyParagraphs = new List<XElement>();

            foreach (var paragraph in articleContentElement.GetElementsByTagName("p"))
            {
                bool isEmptyParagraph =
                    GetInnerText(paragraph, false).Length == 0
                    && paragraph.GetElementsByTagName("img").Count() == 0
                    && paragraph.GetElementsByTagName("embed").Count() == 0
                    && paragraph.GetElementsByTagName("object").Count() == 0;

                if (isEmptyParagraph)
                {
                    emptyParagraphs.Add(paragraph);
                }
            }

            RemoveElements(emptyParagraphs);

            // Remove br's that are directly before paragraphs.
            string contentHtml = articleContentElement.GetInnerHtml();
            string contentHtmlWithoutLeadingBreaks = _BreakBeforeParagraphRegex.Replace(contentHtml, "<p");

            articleContentElement.SetInnerHtml(contentHtmlWithoutLeadingBreaks);
        }
        /// <summary>
        /// Traverses the document, removing elements whose class / id mark them as unlikely article
        /// content and normalizing <c>div</c> elements into paragraphs.
        /// </summary>
        /// <remarks>
        /// Does nothing when <c>_dontStripUnlikelys</c> is set. A <c>div</c> whose inner HTML does not
        /// match <c>_DivToPElementsRegex</c> is renamed to <c>p</c>; otherwise its non-empty text
        /// child nodes are each replaced with an inline-styled <c>p</c> element.
        /// </remarks>
        /// <param name="document">Document to process; modified in place.</param>
        internal void StripUnlikelyCandidates(XDocument document)
        {
            if (_dontStripUnlikelys)
            {
                return;
            }

            var rootElement = document.Root;

            var elementsTraverser = new ElementsTraverser(
                element =>
                {
                    string tagName = "";

                    if (element.Name != null)
                    {
                        tagName = element.Name.LocalName ?? "";
                    }

                    // Remove unlikely candidates (body and anchor elements are never removed here).
                    string classAndId = string.Concat(element.GetClass(), element.GetId());

                    bool isUnlikelyCandidate =
                        classAndId.Length > 0
                        && !string.Equals(tagName, "body", StringComparison.OrdinalIgnoreCase)
                        && !string.Equals(tagName, "a", StringComparison.OrdinalIgnoreCase)
                        && _UnlikelyCandidatesRegex.IsMatch(classAndId)
                        && !_OkMaybeItsACandidateRegex.IsMatch(classAndId);

                    if (isUnlikelyCandidate)
                    {
                        if (element.Parent != null)
                        {
                            element.Remove();
                        }

                        // Nothing more to do for this element - go to the next one.
                        return;
                    }

                    // Only divs need the paragraph normalization below.
                    if (!string.Equals(tagName, "div", StringComparison.OrdinalIgnoreCase))
                    {
                        return;
                    }

                    if (!_DivToPElementsRegex.IsMatch(element.GetInnerHtml()))
                    {
                        // No block elements inside - change the div itself to p.
                        element.Name = "p";
                        return;
                    }

                    // Replace text nodes with p's (experimental).
                    var childNodesTraverser = new ChildNodesTraverser(
                        childNode =>
                        {
                            bool isNonEmptyTextNode =
                                childNode.NodeType == XmlNodeType.Text
                                && GetInnerText(childNode).Length != 0;

                            if (!isNonEmptyTextNode)
                            {
                                return;
                            }

                            // The raw text value is used (not GetInnerText()) to preserve whitespace.
                            string rawText = ((XText)childNode).Value;
                            var inlineParagraph = new XElement("p");

                            inlineParagraph.SetInnerHtml(rawText);
                            inlineParagraph.SetClass(ReadabilityStyledCssClass);
                            inlineParagraph.SetStyle("display: inline;");

                            childNode.ReplaceWith(inlineParagraph);
                        });

                    childNodesTraverser.Traverse(element);
                });

            elementsTraverser.Traverse(rootElement);
        }
        /// <summary>
        /// Rebuilds the document around the extracted article: embeds the readability stylesheet in
        /// the <c>head</c> element and replaces the body content with an overlay div that wraps an
        /// inner div holding the article title and content.
        /// </summary>
        /// <param name="document">Document to rebuild; modified in place.</param>
        /// <param name="articleTitleElement">Title element to insert; skipped when <c>null</c>.</param>
        /// <param name="articleContentElement">Content element to insert; skipped when <c>null</c>.</param>
        /// <exception cref="InternalErrorException">
        /// Thrown when the embedded stylesheet resource cannot be loaded.
        /// </exception>
        internal void GlueDocument(XDocument document, XElement articleTitleElement, XElement articleContentElement)
        {
            var documentBody = GetOrCreateBody(document);

            /* Include readability.css stylesheet. */
            var headElement = document.GetElementsByTagName("head").FirstOrDefault();

            if (headElement == null)
            {
                headElement = new XElement("head");
                documentBody.AddBeforeSelf(headElement);
            }

            var styleElement = new XElement("style");

            styleElement.SetAttributeValue("type", "text/css");

            var readabilityStylesheetStream = Assembly.GetExecutingAssembly().GetManifestResourceStream(_ReadabilityStylesheetResourceName);

            if (readabilityStylesheetStream == null)
            {
                throw new InternalErrorException("Couldn't load the NReadability stylesheet embedded resource.");
            }

            // Disposing the reader also disposes the underlying resource stream.
            using (var sr = new StreamReader(readabilityStylesheetStream))
            {
                styleElement.SetInnerHtml(sr.ReadToEnd());
            }

            headElement.Add(styleElement);

            string readingStyleClass = GetReadingStyleClass(_readingStyle);

            /* Create inner div. */
            var innerDiv = new XElement("div");

            innerDiv.SetId(InnerDivId);
            innerDiv.SetClass(GetReadingMarginClass(_readingMargin) + " " + GetReadingSizeClass(_readingSize));

            if (articleTitleElement != null)
            {
                innerDiv.Add(articleTitleElement);
            }

            if (articleContentElement != null)
            {
                innerDiv.Add(articleContentElement);
            }

            /* Create overlay div. */
            var overlayDiv = new XElement("div");

            overlayDiv.SetId(OverlayDivId);
            overlayDiv.SetClass(readingStyleClass);
            overlayDiv.Add(innerDiv);

            /* Clear the old HTML. */
            documentBody.RemoveAll();

            /* Apply reading style to body.
             * XElement.RemoveAll() removes attributes as well as child nodes, so the style has to be
             * applied after the body is cleared; applied before, it would be wiped together with the
             * old content (assuming SetClass / SetStyle store their values as attributes). */
            documentBody.SetClass(readingStyleClass);
            documentBody.SetStyle("display: block;");

            /* Insert the new content. */
            documentBody.Add(overlayDiv);
        }
 /// <summary>
 /// Removes extraneous break tags from a <paramref name="element" />.
 /// </summary>
 /// <remarks>
 /// Every match of <c>_KillBreaksRegex</c> in the element's inner HTML is replaced with a single
 /// break tag and the result is written back to the element.
 /// </remarks>
 /// <param name="element">Element whose inner HTML is rewritten in place.</param>
 internal void KillBreaks(XElement element)
 {
     string originalHtml = element.GetInnerHtml();
     string htmlWithCollapsedBreaks = _KillBreaksRegex.Replace(originalHtml, "<br />");

     element.SetInnerHtml(htmlWithCollapsedBreaks);
 }
        /// <summary>
        /// Recursively appends subsequent pages of a multipage article.
        /// </summary>
        /// <remarks>
        /// Each call fetches and transcodes the page at <paramref name="url" />, strips its
        /// <c>h1</c> header, skips it when it looks like a duplicate of content already present,
        /// and otherwise appends it to the <c>readInner</c> element of <paramref name="document" />
        /// as a new <c>div</c> with class <c>page</c>. Once the page counter exceeds
        /// <c>_MaxPages</c>, a "View Next Page" link is appended instead of the page content.
        /// </remarks>
        /// <param name="document">Compiled document</param>
        /// <param name="url">Url of current page</param>
        private void AppendNextPage(XDocument document, string url)
        {
            _curPageNum++;

            var contentDiv = document.GetElementById("readInner");

            if (_curPageNum > _MaxPages)
            {
                // Build the link from nodes instead of parsing concatenated markup: the url is
                // escaped automatically (a raw '&' or quote in it would break XML parsing), and
                // an XDocument cannot be added as content of an XElement in the first place.
                var nextPageLinkDiv =
                    new XElement("div",
                        new XAttribute("style", "text-align: center"),
                        new XElement("a",
                            new XAttribute("href", url ?? ""),
                            "View Next Page"));

                contentDiv.Add(nextPageLinkDiv);
                return;
            }

            string nextContent = _urlFetcher.Fetch(url);

            if (string.IsNullOrEmpty(nextContent))
            {
                return;
            }

            bool mainContentExtracted;
            string extractedTitle;
            string nextPageLink;
            var nextDocument = _transcoder.TranscodeToXml(nextContent, url, out mainContentExtracted, out extractedTitle, out nextPageLink);
            var nextInner = nextDocument.GetElementById("readInner");

            if (nextInner == null)
            {
                // The transcoded page has no content container - there is nothing to append.
                return;
            }

            var header = nextInner.Element("h1");

            if (header != null)
            {
                header.Remove();
            }

            /*
             * Anti-duplicate mechanism. Essentially, get the first paragraph of the new page and
             * compare it against the content gathered so far. If the existing content contains
             * exactly the text of this first paragraph, the page is probably a duplicate.
             */
            var firstP = nextInner.GetElementsByTagName("p").FirstOrDefault();

            if (firstP != null && firstP.GetInnerHtml().Length > 100)
            {
                // TODO: This test could probably be improved to compare the actual markup.
                string existingContent = contentDiv.Value;
                string firstParagraphText = firstP.Value;

                // string.Contains performs an ordinal search, which is what an exact-match
                // duplicate test needs (no culture-specific equivalences).
                if (!string.IsNullOrEmpty(existingContent)
                 && !string.IsNullOrEmpty(firstParagraphText)
                 && existingContent.Contains(firstParagraphText))
                {
                    _parsedPages.Add(url);
                    return;
                }
            }

            /* Add the content to the existing html */
            var nextDiv = new XElement("div");

            if (_pageSeparatorBuilder != null)
            {
                nextDiv.SetInnerHtml(_pageSeparatorBuilder(_curPageNum));
            }

            nextDiv.SetId(_PageIdPrefix + _curPageNum);
            nextDiv.SetClass("page");
            nextDiv.Add(nextInner.Nodes());
            contentDiv.Add(nextDiv);
            _parsedPages.Add(url);

            /* Only continue if we haven't already seen the next page */
            if (!string.IsNullOrEmpty(nextPageLink) && !_parsedPages.Contains(nextPageLink))
            {
                AppendNextPage(document, nextPageLink);
            }
        }
        /// <summary>
        /// Works out the article title and returns it wrapped in a new <c>h1</c> element.
        /// </summary>
        /// <remarks>
        /// Starts from the document title. If it matches the dash pattern or contains ": ", a part
        /// of it is extracted with the corresponding regular expressions. If its length is outside
        /// the configured range, the text of the only <c>h1</c> (or, when there are no <c>h1</c>
        /// elements, the only <c>h2</c>) is used. A candidate with too few words is replaced with
        /// the full document title, provided the document has one.
        /// </remarks>
        /// <param name="document">Document to extract the title from.</param>
        /// <returns>
        /// A new <c>h1</c> element holding the title, or <c>null</c> when the title is empty.
        /// </returns>
        internal XElement ExtractArticleTitle(XDocument document)
        {
            XElement body = GetOrCreateBody(document);
            string fullTitle = document.GetTitle() ?? "";
            string candidate = fullTitle;

            if (_ArticleTitleDashRegex1.IsMatch(fullTitle))
            {
                candidate = _ArticleTitleDashRegex2.Replace(fullTitle, "$1");

                bool hasTooFewWords = candidate.Split(' ').Length < _MinArticleTitleWordsCount1;

                if (hasTooFewWords)
                {
                    candidate = _ArticleTitleDashRegex3.Replace(fullTitle, "$1");
                }
            }
            else if (fullTitle.IndexOf(": ") >= 0)
            {
                candidate = _ArticleTitleColonRegex1.Replace(fullTitle, "$1");

                bool hasTooFewWords = candidate.Split(' ').Length < _MinArticleTitleWordsCount1;

                if (hasTooFewWords)
                {
                    candidate = _ArticleTitleColonRegex2.Replace(fullTitle, "$1");
                }
            }
            else if (fullTitle.Length > _MaxArticleTitleLength || fullTitle.Length < _MinArticleTitleLength)
            {
                List<XElement> headers = body.GetElementsByTagName("h1").ToList();

                // Without any level-one headers, give level-two headers a chance.
                if (headers.Count < 1)
                {
                    headers = body.GetElementsByTagName("h2").ToList();
                }

                // Only an unambiguous (single) header is trusted as the title.
                if (headers.Count == 1)
                {
                    candidate = GetInnerText(headers[0]);
                }
            }

            candidate = (candidate ?? "").Trim();

            // fullTitle is never null here, so a length check is enough to tell whether it is empty.
            bool hasFullTitle = fullTitle.Length != 0;

            if (hasFullTitle && candidate.Split(' ').Length <= _MinArticleTitleWordsCount2)
            {
                candidate = fullTitle;
            }

            // candidate is never null at this point either.
            if (candidate.Length == 0)
            {
                return null;
            }

            var titleElement = new XElement("h1");

            titleElement.SetInnerHtml(candidate);

            return titleElement;
        }
        /// <summary>
        /// Checks that plain text containing CRLF line breaks is returned unchanged by
        /// <c>GetInnerHtml</c> after being set with <c>SetInnerHtml</c>.
        /// </summary>
        public void Test_SetInnerHtml_text_multiline()
        {
            // Arrange
            const string expectedHtml = "\r\ntext1\r\ntext\r\n";
            var div = new XElement("div");

            // Act
            div.SetInnerHtml(expectedHtml);
            var actualHtml = div.GetInnerHtml();

            // Assert
            Assert.AreEqual(expectedHtml, actualHtml);
        }
        /// <summary>
        /// Checks that a single word of plain text is returned unchanged by
        /// <c>GetInnerHtml</c> after being set with <c>SetInnerHtml</c>.
        /// </summary>
        public void Test_SetInnerHtml_text()
        {
            // Arrange
            const string expectedHtml = "text";
            var div = new XElement("div");

            // Act
            div.SetInnerHtml(expectedHtml);
            var actualHtml = div.GetInnerHtml();

            // Assert
            Assert.AreEqual(expectedHtml, actualHtml);
        }
        /// <summary>
        /// Checks that the <c>&amp;raquo;</c> entity passed to <c>SetInnerHtml</c> shows up as the
        /// "»" character in the value returned by <c>GetInnerHtml</c>.
        /// </summary>
        public void Test_SetInnerHtml_html_with_entity_raquo()
        {
            // Arrange
            const string entityHtml = "&raquo;";
            var div = new XElement("div");

            // Act
            div.SetInnerHtml(entityHtml);
            var actualHtml = div.GetInnerHtml();
            bool containsRaquoCharacter = actualHtml.Contains("»");

            // Assert
            Assert.IsTrue(containsRaquoCharacter);
        }
        /// <summary>
        /// Rebuilds the document around the extracted article: embeds the readability stylesheet in
        /// the <c>head</c> element and replaces the body content with the article title and content.
        /// </summary>
        /// <param name="document">Document to rebuild; modified in place.</param>
        /// <param name="articleTitleElement">Title element to insert; skipped when <c>null</c>.</param>
        /// <param name="articleContentElement">Content element to insert; skipped when <c>null</c>.</param>
        /// <exception cref="InternalErrorException">
        /// Thrown when the embedded stylesheet resource cannot be loaded.
        /// </exception>
        internal void GlueDocument(XDocument document, XElement articleTitleElement, XElement articleContentElement)
        {
            XElement documentBody = GetOrCreateBody(document);

            /* Include readability.css stylesheet. */
            XElement headElement = document.GetElementsByTagName("head").FirstOrDefault();

            if (headElement == null)
            {
                headElement = new XElement("head");
                documentBody.AddBeforeSelf(headElement);
            }

            XElement styleElement = new XElement("style");

            styleElement.SetAttributeValue("type", "text/css");

            Stream readabilityStylesheetStream = typeof(NReadabilityTranscoder).GetTypeInfo().Assembly.GetManifestResourceStream(_ReadabilityStylesheetResourceName);

            if (readabilityStylesheetStream == null)
            {
                throw new InternalErrorException("Couldn't load the NReadability stylesheet embedded resource.");
            }

            // Disposing the reader also disposes the underlying resource stream.
            using (var sr = new StreamReader(readabilityStylesheetStream))
            {
                styleElement.SetInnerHtml(sr.ReadToEnd());
            }

            headElement.Add(styleElement);

            /* Clear the old HTML first.
             * XElement.RemoveAll() removes attributes as well as child nodes, so the reading style
             * has to be applied after the body is cleared; applied before, it would be wiped together
             * with the old content (assuming SetClass / SetStyle store their values as attributes). */
            documentBody.RemoveAll();

            /* Apply reading style to body. */
            string readingStyleClass = GetReadingStyleClass(_readingStyle);

            documentBody.SetClass(readingStyleClass);
            documentBody.SetStyle("display: block;");

            /* Insert the new content. */
            if (articleTitleElement != null)
            {
                documentBody.Add(articleTitleElement);
            }

            if (articleContentElement != null)
            {
                documentBody.Add(articleContentElement);
            }
        }
Example #12
0
using System;