/// <summary>
/// Scores the ancestors of every sufficiently long <c>p</c> element and returns the
/// elements that received points.
/// </summary>
/// <remarks>
/// <c>_elementsScores</c> is cleared before scoring starts. Each paragraph whose inner text
/// has at least <c>_MinParagraphLength</c> characters awards its full score to its parent and
/// half of it (integer division) to its grandparent; elements named <c>html</c> never
/// receive points.
/// </remarks>
/// <param name="document">Document whose <c>p</c> elements are examined.</param>
/// <returns>The distinct parent and grandparent elements that were awarded points.</returns>
internal IEnumerable<XElement> FindCandidatesForArticleContent(XDocument document)
{
    var paragraphs = document.GetElementsByTagName("p");
    var candidates = new HashSet<XElement>();

    _elementsScores.Clear();

    foreach (var paragraph in paragraphs)
    {
        string text = GetInnerText(paragraph);

        if (text.Length < _MinParagraphLength)
        {
            continue;
        }

        // 1 point for being a paragraph, plus the comma-segment count, plus one point per
        // _ParagraphSegmentLength characters (capped at _MaxPointsForSegmentsCount).
        int points = 1
            + GetSegmentsCount(text, ',')
            + Math.Min(text.Length / _ParagraphSegmentLength, _MaxPointsForSegmentsCount);

        var parent = paragraph.Parent;
        var grandParent = parent == null ? null : parent.Parent;

        // The parent receives the whole score.
        if (IsEligibleScoreRecipient(parent))
        {
            candidates.Add(parent);
            AddPointsToElementScore(parent, points);
        }

        // The grandparent receives half of it.
        if (IsEligibleScoreRecipient(grandParent))
        {
            candidates.Add(grandParent);
            AddPointsToElementScore(grandParent, points / 2);
        }
    }

    return candidates;
}

/// <summary>
/// Tells whether <paramref name="element"/> exists and is not an <c>html</c> element
/// (name compared case-insensitively).
/// </summary>
private static bool IsEligibleScoreRecipient(XElement element)
{
    if (element == null)
    {
        return false;
    }

    XName name = element.Name;

    return name == null || !"html".Equals(name.LocalName, StringComparison.OrdinalIgnoreCase);
}
/// <summary>
/// Rewrites the <paramref name="attributeName"/> attribute of every
/// <paramref name="tagName"/> element with the value produced by
/// <c>ResolveElementUrl</c> for <paramref name="url"/>, optionally post-processed by
/// <paramref name="attributeValueTransformer"/>.
/// </summary>
/// <param name="document">Document to process.</param>
/// <param name="tagName">Tag name of the elements to visit.</param>
/// <param name="attributeName">Attribute holding the URL to resolve.</param>
/// <param name="url">URL passed to <c>ResolveElementUrl</c> together with the attribute value.</param>
/// <param name="attributeValueTransformer">
/// Optional callback invoked with the resolved value and its element; when <c>null</c>, the
/// resolved value is written unchanged.
/// </param>
/// <exception cref="ArgumentNullException">
/// <paramref name="document"/> is <c>null</c>, or <paramref name="url"/> is <c>null</c> or empty.
/// </exception>
private static void ResolveElementsUrls(XDocument document, string tagName, string attributeName, string url, Func<AttributeTransformationInput, AttributeTransformationResult> attributeValueTransformer)
{
    if (document == null)
    {
        throw new ArgumentNullException("document");
    }

    if (string.IsNullOrEmpty(url))
    {
        throw new ArgumentNullException("url");
    }

    foreach (var element in document.GetElementsByTagName(tagName))
    {
        var rawValue = element.GetAttributeValue(attributeName, null);

        // Elements without the attribute are left untouched.
        if (rawValue == null)
        {
            continue;
        }

        var resolvedValue = ResolveElementUrl(rawValue, url);

        // Nothing is written when resolution yields no value.
        if (string.IsNullOrEmpty(resolvedValue))
        {
            continue;
        }

        AttributeTransformationResult outcome = attributeValueTransformer == null
            ? new AttributeTransformationResult { TransformedValue = resolvedValue }
            : attributeValueTransformer(new AttributeTransformationInput { AttributeValue = resolvedValue, Element = element });

        element.SetAttributeValue(attributeName, outcome.TransformedValue);

        // When the transformer names a backup attribute, keep the resolved
        // (pre-transformation) value there.
        var backupAttributeName = outcome.OriginalValueAttributeName;

        if (!string.IsNullOrEmpty(backupAttributeName))
        {
            element.SetAttributeValue(backupAttributeName, resolvedValue);
        }
    }
}
/// <summary>
/// Rebuilds the document around the extracted article: embeds the readability stylesheet
/// in the head and replaces the body's content with an overlay div that wraps an inner div
/// holding the title and content elements.
/// </summary>
/// <param name="document">Document to rewrite in place.</param>
/// <param name="articleTitleElement">Title element to insert; skipped when <c>null</c>.</param>
/// <param name="articleContentElement">Content element to insert; skipped when <c>null</c>.</param>
/// <exception cref="InternalErrorException">
/// The embedded stylesheet resource could not be loaded.
/// </exception>
internal void GlueDocument(XDocument document, XElement articleTitleElement, XElement articleContentElement)
{
    var documentBody = GetOrCreateBody(document);

    /* Include readability.css stylesheet. */
    var headElement = document.GetElementsByTagName("head").FirstOrDefault();

    if (headElement == null)
    {
        // No head found: create one and place it immediately before the body.
        headElement = new XElement("head");
        documentBody.AddBeforeSelf(headElement);
    }

    var styleElement = new XElement("style");

    styleElement.SetAttributeValue("type", "text/css");

    var readabilityStylesheetStream = Assembly.GetExecutingAssembly().GetManifestResourceStream(_ReadabilityStylesheetResourceName);

    if (readabilityStylesheetStream == null)
    {
        throw new InternalErrorException("Couldn't load the NReadability stylesheet embedded resource.");
    }

    using (var sr = new StreamReader(readabilityStylesheetStream))
    {
        styleElement.SetInnerHtml(sr.ReadToEnd());
    }

    headElement.Add(styleElement);

    string readingStyleClass = GetReadingStyleClass(_readingStyle);

    /* Create inner div. */
    var innerDiv = new XElement("div");

    innerDiv.SetId(InnerDivId);
    innerDiv.SetClass(GetReadingMarginClass(_readingMargin) + " " + GetReadingSizeClass(_readingSize));

    if (articleTitleElement != null)
    {
        innerDiv.Add(articleTitleElement);
    }

    if (articleContentElement != null)
    {
        innerDiv.Add(articleContentElement);
    }

    /* Create overlay div. */
    var overlayDiv = new XElement("div");

    overlayDiv.SetId(OverlayDivId);
    overlayDiv.SetClass(readingStyleClass);
    overlayDiv.Add(innerDiv);

    /* Clear the old HTML. */
    documentBody.RemoveAll();

    /*
     * Apply reading style to body. This has to come after RemoveAll(): XElement.RemoveAll()
     * drops the element's attributes as well as its child nodes, so anything set on the body
     * before that call would be discarded.
     * NOTE(review): assumes SetClass/SetStyle store their values as attributes - confirm.
     */
    documentBody.SetClass(readingStyleClass);
    documentBody.SetStyle("display: block;");

    /* Insert the new content. */
    documentBody.Add(overlayDiv);
}
/// <summary>
/// Returns the first element that <c>GetElementsByTagName</c> yields for
/// <paramref name="articleContentElementHint"/>, or <c>null</c> when there is none.
/// </summary>
/// <param name="document">Document to search.</param>
/// <param name="articleContentElementHint">Tag name to look up.</param>
/// <returns>The first matching element, or <c>null</c> if nothing matches.</returns>
/// <exception cref="ArgumentNullException"><paramref name="document"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException">
/// <paramref name="articleContentElementHint"/> is <c>null</c> or empty.
/// </exception>
private static XElement TryFindArticleContentElement(XDocument document, string articleContentElementHint)
{
    if (document == null)
    {
        throw new ArgumentNullException("document");
    }

    bool hintIsMissing = articleContentElementHint == null || articleContentElementHint.Length == 0;

    if (hintIsMissing)
    {
        throw new ArgumentException("Argument can't be null nor empty.", "articleContentElementHint");
    }

    var matchingElements = document.GetElementsByTagName(articleContentElementHint);

    return matchingElements.FirstOrDefault();
}
/// <summary>
/// Rebuilds the document around the extracted article: embeds the readability stylesheet
/// in the head and replaces the body's content with the title and content elements.
/// </summary>
/// <param name="document">Document to rewrite in place.</param>
/// <param name="articleTitleElement">Title element to insert; skipped when <c>null</c>.</param>
/// <param name="articleContentElement">Content element to insert; skipped when <c>null</c>.</param>
/// <exception cref="InternalErrorException">
/// The embedded stylesheet resource could not be loaded.
/// </exception>
internal void GlueDocument(XDocument document, XElement articleTitleElement, XElement articleContentElement)
{
    XElement documentBody = GetOrCreateBody(document);

    /* Include readability.css stylesheet. */
    XElement headElement = document.GetElementsByTagName("head").FirstOrDefault();

    if (headElement == null)
    {
        // No head found: create one and place it immediately before the body.
        headElement = new XElement("head");
        documentBody.AddBeforeSelf(headElement);
    }

    XElement styleElement = new XElement("style");

    styleElement.SetAttributeValue("type", "text/css");

    Stream readabilityStylesheetStream = typeof(NReadabilityTranscoder).GetTypeInfo().Assembly.GetManifestResourceStream(_ReadabilityStylesheetResourceName);

    if (readabilityStylesheetStream == null)
    {
        throw new InternalErrorException("Couldn't load the NReadability stylesheet embedded resource.");
    }

    using (var sr = new StreamReader(readabilityStylesheetStream))
    {
        styleElement.SetInnerHtml(sr.ReadToEnd());
    }

    headElement.Add(styleElement);

    /* Clear the old HTML. */
    documentBody.RemoveAll();

    /*
     * Apply reading style to body. This has to come after RemoveAll(): XElement.RemoveAll()
     * drops the element's attributes as well as its child nodes, so anything set on the body
     * before that call would be discarded.
     * NOTE(review): assumes SetClass/SetStyle store their values as attributes - confirm.
     */
    string readingStyleClass = GetReadingStyleClass(_readingStyle);

    documentBody.SetClass(readingStyleClass);
    documentBody.SetStyle("display: block;");

    /* Insert the new content. */
    if (articleTitleElement != null)
    {
        documentBody.Add(articleTitleElement);
    }

    if (articleContentElement != null)
    {
        documentBody.Add(articleContentElement);
    }
}
using System;