示例#1
0
        /// <summary>
        /// Replaces the content of <paramref name="element"/> with the nodes parsed from <paramref name="html"/>.
        /// </summary>
        /// <remarks>
        /// The element is cleared with <see cref="XElement.RemoveAll"/>, which removes its attributes
        /// as well as its child nodes. The new content is the child nodes of the parsed document's root
        /// element; when the parsed document has no root, the element is simply left empty.
        /// </remarks>
        /// <param name="element">Element whose content is replaced.</param>
        /// <param name="html">Markup to parse into the new content.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="element"/> or <paramref name="html"/> is null.
        /// </exception>
        public static void SetInnerHtml(this XElement element, string html)
        {
            if (element == null)
            {
                throw new ArgumentNullException(nameof(element));
            }

            if (html == null)
            {
                throw new ArgumentNullException(nameof(html));
            }

            // Parse before touching the element: if BuildDocument throws, the element
            // must keep its current content instead of being left emptied.
            var parsedDocument = new SgmlDomBuilder().BuildDocument(html);

            element.RemoveAll();

            var parsedRoot = parsedDocument.Root;

            if (parsedRoot == null)
            {
                return;
            }

            // XContainer.Add clones a node that already has a parent, so the parsed
            // document is not modified while its nodes are being enumerated.
            foreach (var node in parsedRoot.Nodes())
            {
                element.Add(node);
            }
        }
        /// <summary>
        /// Creates an NReadabilityTranscoder with every option supplied explicitly.
        /// </summary>
        /// <param name="dontStripUnlikelys">Controls whether elements that are unlikely to be part of the main content get removed.</param>
        /// <param name="dontNormalizeSpacesInTextContent">Controls whether spaces in the InnerText of elements are normalized automatically (e.g. double spaces collapsed into single ones).</param>
        /// <param name="dontWeightClasses">Controls whether the 'weight-class' algorithm is applied while cleaning content.</param>
        /// <param name="readingStyle">Styling applied to the extracted article.</param>
        /// <param name="readingMargin">Margin applied to the extracted article.</param>
        /// <param name="readingSize">Font size applied to the extracted article.</param>
        private NReadabilityTranscoder(
            bool dontStripUnlikelys,
            bool dontNormalizeSpacesInTextContent,
            bool dontWeightClasses,
            ReadingStyle readingStyle,
            ReadingMargin readingMargin,
            ReadingSize readingSize)
        {
            // Collaborators and the (initially empty) XElement-to-float score map.
            _sgmlDomBuilder = new SgmlDomBuilder();
            _sgmlDomSerializer = new SgmlDomSerializer();
            _elementsScores = new Dictionary<XElement, float>();

            // Content-processing switches, stored exactly as supplied.
            _dontStripUnlikelys = dontStripUnlikelys;
            _dontNormalizeSpacesInTextContent = dontNormalizeSpacesInTextContent;
            _dontWeightClasses = dontWeightClasses;

            // Presentation settings for the extracted article.
            _readingStyle = readingStyle;
            _readingMargin = readingMargin;
            _readingSize = readingSize;
        }
        /// <summary>
        /// Asserts that the document built from <paramref name="content"/> contains no elements
        /// other than html, head, meta and body (names compared case-insensitively).
        /// </summary>
        /// <param name="content">Markup to check; trimmed before parsing when it is not null.</param>
        private static void AssertHtmlContentIsEmpty(string content)
        {
            if (content != null)
            {
                content = content.Trim();
            }

            // Element names that do not count as content.
            var ignoredNames = new[] { "html", "head", "meta", "body" };

            var parsedDocument = new SgmlDomBuilder().BuildDocument(content);
            int unexpectedElements = 0;

            foreach (var descendant in parsedDocument.DescendantNodes())
            {
                var descendantElement = descendant as XElement;

                if (descendantElement == null)
                {
                    // Non-element nodes are not counted.
                    continue;
                }

                string localName = descendantElement.Name.LocalName;
                bool isIgnored = Array.Exists(
                    ignoredNames,
                    ignored => string.Equals(ignored, localName, StringComparison.OrdinalIgnoreCase));

                if (!isIgnored)
                {
                    unexpectedElements++;
                }
            }

            Assert.AreEqual(0, unexpectedElements);
        }
 /// <summary>
 /// Type initializer: creates the SGML DOM builder and serializer instances
 /// stored in the static fields of this class.
 /// </summary>
 static NReadabilityTranscoderTests_Old()
 {
     _sgmlDomBuilder = new SgmlDomBuilder();
     _sgmlDomSerializer = new SgmlDomSerializer();
 }
示例#5
0
        /// <summary>
        /// Replaces the content of <paramref name="element"/> with the nodes parsed from <paramref name="html"/>.
        /// </summary>
        /// <remarks>
        /// Clearing is done with <see cref="XElement.RemoveAll"/>, so the element loses its attributes
        /// together with its child nodes. The replacement content is taken from the child nodes of the
        /// parsed document's root element; a parsed document without a root leaves the element empty.
        /// </remarks>
        /// <param name="element">Element whose content is replaced.</param>
        /// <param name="html">Markup to parse into the new content.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="element"/> or <paramref name="html"/> is null.
        /// </exception>
        public static void SetInnerHtml(this XElement element, string html)
        {
            if (element == null)
            {
                throw new ArgumentNullException(nameof(element));
            }

            if (html == null)
            {
                throw new ArgumentNullException(nameof(html));
            }

            // Build the replacement content first so that an exception from the parser
            // cannot leave the element already emptied.
            var replacement = new SgmlDomBuilder().BuildDocument(html);

            element.RemoveAll();

            var replacementRoot = replacement.Root;

            if (replacementRoot == null)
            {
                return;
            }

            // Nodes that already have a parent are cloned by XContainer.Add, which keeps
            // the enumeration over the parsed document's nodes valid.
            foreach (var node in replacementRoot.Nodes())
            {
                element.Add(node);
            }
        }
        /// <summary>
        /// Creates an NReadabilityTranscoder with every option supplied explicitly.
        /// </summary>
        /// <param name="dontStripUnlikelys">Controls whether elements that are unlikely to be part of the main content get removed.</param>
        /// <param name="dontNormalizeSpacesInTextContent">Controls whether spaces in the InnerText of elements are normalized automatically (e.g. double spaces collapsed into single ones).</param>
        /// <param name="dontWeightClasses">Controls whether the 'weight-class' algorithm is applied while cleaning content.</param>
        /// <param name="divIdHints">For specific websites, extracts the node with a specific div id as the content block. Optional; defaults to null.</param>
        private NReadabilityTranscoder(
            bool dontStripUnlikelys,
            bool dontNormalizeSpacesInTextContent,
            bool dontWeightClasses,
            Dictionary<Regex, string> divIdHints = null)
        {
            // Collaborators and the (initially empty) XElement-to-float score map.
            _sgmlDomBuilder = new SgmlDomBuilder();
            _sgmlDomSerializer = new SgmlDomSerializer();
            _elementsScores = new Dictionary<XElement, float>();

            // Content-processing switches, stored exactly as supplied.
            _dontStripUnlikelys = dontStripUnlikelys;
            _dontNormalizeSpacesInTextContent = dontNormalizeSpacesInTextContent;
            _dontWeightClasses = dontWeightClasses;

            // Stored as given; stays null when the caller passes no hints.
            _articleContentDivIdHints2 = divIdHints;
        }