/// <summary>
/// Replaces the content of <paramref name="element"/> with the nodes obtained by parsing <paramref name="html"/>.
/// </summary>
/// <param name="element">Element whose content is replaced.</param>
/// <param name="html">Markup to parse with <c>SgmlDomBuilder</c>.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="element"/> or <paramref name="html"/> is <c>null</c>.
/// </exception>
public static void SetInnerHtml(this XElement element, string html)
{
    if (element == null)
    {
        throw new ArgumentNullException(nameof(element));
    }

    if (html == null)
    {
        throw new ArgumentNullException(nameof(html));
    }

    // NOTE(review): XElement.RemoveAll drops the element's attributes as well as its child nodes —
    // confirm that is intended for an "inner HTML" setter.
    element.RemoveAll();

    var parsedRoot = new SgmlDomBuilder().BuildDocument(html).Root;

    if (parsedRoot != null)
    {
        // Passing the sequence to Add appends every top-level node of the parsed root, in order.
        element.Add(parsedRoot.Nodes());
    }
}
/// <summary>
/// Creates an NReadabilityTranscoder with every option specified explicitly.
/// </summary>
/// <param name="dontStripUnlikelys">Controls whether elements that are unlikely to belong to the main content are removed.</param>
/// <param name="dontNormalizeSpacesInTextContent">Controls whether spaces in the InnerText of elements are normalized automatically (e.g. whether double spaces are replaced with single spaces).</param>
/// <param name="dontWeightClasses">Controls whether the 'weight-class' algorithm is used when cleaning content.</param>
/// <param name="readingStyle">Styling applied to the extracted article.</param>
/// <param name="readingMargin">Margin applied to the extracted article.</param>
/// <param name="readingSize">Font size applied to the extracted article.</param>
private NReadabilityTranscoder(
    bool dontStripUnlikelys,
    bool dontNormalizeSpacesInTextContent,
    bool dontWeightClasses,
    ReadingStyle readingStyle,
    ReadingMargin readingMargin,
    ReadingSize readingSize)
{
    // Collaborators and per-instance state are created fresh for each transcoder.
    _sgmlDomBuilder = new SgmlDomBuilder();
    _sgmlDomSerializer = new SgmlDomSerializer();
    _elementsScores = new Dictionary<XElement, float>();

    // Content-extraction switches, stored exactly as supplied.
    _dontStripUnlikelys = dontStripUnlikelys;
    _dontNormalizeSpacesInTextContent = dontNormalizeSpacesInTextContent;
    _dontWeightClasses = dontWeightClasses;

    // Presentation options for the extracted article.
    _readingStyle = readingStyle;
    _readingMargin = readingMargin;
    _readingSize = readingSize;
}
/// <summary>
/// Asserts that the given HTML, once parsed, contains no elements other than
/// <c>html</c>, <c>head</c>, <c>meta</c> and <c>body</c> (names compared case-insensitively).
/// </summary>
/// <param name="content">HTML to check; trimmed before parsing when it is not <c>null</c>.</param>
private static void AssertHtmlContentIsEmpty(string content)
{
    string trimmedContent = content == null ? null : content.Trim();
    var parsedDocument = new SgmlDomBuilder().BuildDocument(trimmedContent);

    int unexpectedElements = 0;

    foreach (var descendant in parsedDocument.Descendants())
    {
        string tagName = descendant.Name.LocalName;

        // These tags are tolerated as document scaffolding and do not count as content.
        bool isScaffolding =
            string.Equals(tagName, "html", StringComparison.OrdinalIgnoreCase)
            || string.Equals(tagName, "head", StringComparison.OrdinalIgnoreCase)
            || string.Equals(tagName, "meta", StringComparison.OrdinalIgnoreCase)
            || string.Equals(tagName, "body", StringComparison.OrdinalIgnoreCase);

        if (isScaffolding)
        {
            continue;
        }

        unexpectedElements++;
    }

    Assert.AreEqual(0, unexpectedElements);
}
/// <summary>
/// Type initializer: creates the SGML DOM builder and serializer held in the
/// static fields of this test class.
/// </summary>
static NReadabilityTranscoderTests_Old()
{
    NReadabilityTranscoderTests_Old._sgmlDomBuilder = new SgmlDomBuilder();
    NReadabilityTranscoderTests_Old._sgmlDomSerializer = new SgmlDomSerializer();
}
/// <summary>
/// Replaces the content of <paramref name="element"/> with the nodes obtained by parsing <paramref name="html"/>.
/// </summary>
/// <param name="element">Element whose content is replaced.</param>
/// <param name="html">Markup to parse with <c>SgmlDomBuilder</c>.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="element"/> or <paramref name="html"/> is <c>null</c>.
/// </exception>
public static void SetInnerHtml(this XElement element, string html)
{
    // nameof keeps the reported parameter names in sync with the signature if it is ever renamed.
    if (element == null)
    {
        throw new ArgumentNullException(nameof(element));
    }

    if (html == null)
    {
        throw new ArgumentNullException(nameof(html));
    }

    // NOTE(review): XElement.RemoveAll drops the element's attributes as well as its child nodes —
    // confirm that is intended for an "inner HTML" setter.
    element.RemoveAll();

    var tmpElement = new SgmlDomBuilder().BuildDocument(html);

    if (tmpElement.Root == null)
    {
        // Nothing was parsed into a root element; the target is left empty.
        return;
    }

    // Append every top-level node of the parsed root, in document order.
    foreach (var node in tmpElement.Root.Nodes())
    {
        element.Add(node);
    }
}
/// <summary>
/// Creates an NReadabilityTranscoder with every option specified explicitly.
/// </summary>
/// <param name="dontStripUnlikelys">Controls whether elements that are unlikely to belong to the main content are removed.</param>
/// <param name="dontNormalizeSpacesInTextContent">Controls whether spaces in the InnerText of elements are normalized automatically (e.g. whether double spaces are replaced with single spaces).</param>
/// <param name="dontWeightClasses">Controls whether the 'weight-class' algorithm is used when cleaning content.</param>
/// <param name="divIdHints">Hints for specific websites: the node with the given div id is extracted as the content block. Optional; defaults to <c>null</c>.</param>
private NReadabilityTranscoder(
    bool dontStripUnlikelys,
    bool dontNormalizeSpacesInTextContent,
    bool dontWeightClasses,
    Dictionary<Regex, string> divIdHints = null)
{
    // Collaborators and per-instance state are created fresh for each transcoder.
    _sgmlDomBuilder = new SgmlDomBuilder();
    _sgmlDomSerializer = new SgmlDomSerializer();
    _elementsScores = new Dictionary<XElement, float>();

    // Content-extraction switches, stored exactly as supplied.
    _dontStripUnlikelys = dontStripUnlikelys;
    _dontNormalizeSpacesInTextContent = dontNormalizeSpacesInTextContent;
    _dontWeightClasses = dontWeightClasses;

    // Stored as given; remains null when the caller passes no hints.
    _articleContentDivIdHints2 = divIdHints;
}