/// <summary>
        /// Recursively removes unwanted nodes from the subtree rooted at <paramref name="currentNode"/>.
        /// A tag whose name is contained in <c>_filterTags</c> is removed together with its whole subtree;
        /// script and comment nodes are removed at every depth, not only directly below the starting parent.
        /// </summary>
        /// <param name="currentNode">The node to inspect; expected to be a child of <paramref name="parentTag"/>.</param>
        /// <param name="parentTag">The tag from whose <c>Children</c> list the node is removed when it is filtered out.</param>
        private void FilterInvalidNodes(HtmlNode currentNode, HtmlNode.Tag parentTag)
        {
            var currentTag = currentNode as HtmlNode.Tag;
            if (currentTag != null)
            {
                if (_filterTags.Contains(currentTag.Name))
                {
                    // Dropping the tag also drops everything below it, so there is no need to descend.
                    parentTag.Children.Remove(currentTag);
                    return;
                }

                // Iterate over a copy because the recursive calls remove entries from the live list
                // (Clone() is presumed to return a shallow snapshot - defined outside this block).
                // Every child is visited, not just tags: restricting the recursion to tags meant that
                // nested script/comment nodes never reached the removal branch below.
                foreach (var childNode in currentTag.Children.Clone())
                {
                    FilterInvalidNodes(childNode, currentTag);
                }
                return;
            }

            if (currentNode is HtmlNode.Script || currentNode is HtmlNode.Comment)
            {
                parentTag.Children.Remove(currentNode);
            }
        }
        /// <summary>
        /// Builds the opening <c>&lt;a&gt;</c> tag string for the given anchor tag.
        /// </summary>
        /// <remarks>
        /// An href that does not start with <c>http://</c> or <c>https://</c> (compared ordinally and
        /// case-insensitively) is prefixed with <c>_currentUrlDomain</c>. A non-empty <c>target</c>
        /// attribute is carried over; no other attribute is written.
        /// NOTE(review): attribute values are written without HTML escaping - confirm whether the parser
        /// hands them over already encoded before adding escaping here.
        /// </remarks>
        /// <param name="tag">The anchor tag; it must carry an <c>href</c> attribute because the value is read through the indexer.</param>
        /// <returns>The opening anchor tag, e.g. <c>&lt;a href="..." target="..."&gt;</c>.</returns>
        private string BuildHrefTagString(HtmlNode.Tag tag)
        {
            var href = tag.Attributes["href"];

            // URL schemes are case-insensitive, so "HTTP://..." must count as absolute as well.
            // The ordinal comparison also keeps the check independent of the current culture.
            var hasHttpScheme =
                href.StartsWith("http://", System.StringComparison.OrdinalIgnoreCase)
                || href.StartsWith("https://", System.StringComparison.OrdinalIgnoreCase);
            if (!hasHttpScheme)
            {
                href = _currentUrlDomain + href;
            }

            // Single lookup instead of ContainsKey followed by the indexer.
            string target;
            if (tag.Attributes.TryGetValue("target", out target) && !string.IsNullOrEmpty(target))
            {
                return string.Format("<a href=\"{0}\" target=\"{1}\">", href, target);
            }

            return string.Format("<a href=\"{0}\">", href);
        }
Пример #3
0
 /// <summary>
 /// Visitor callback for text nodes. The body is empty, so this visitor takes no action for them.
 /// </summary>
 /// <param name="text">The visited text node (unused).</param>
 public void Visit(HtmlNode.Text text)
 {
 }
Пример #4
0
 /// <summary>
 /// Visitor callback for script nodes. The body is empty, so this visitor takes no action for them.
 /// </summary>
 /// <param name="script">The visited script node (unused).</param>
 public void Visit(HtmlNode.Script script)
 {
 }
Пример #5
0
 /// <summary>
 /// Visitor callback for comment nodes. The body is empty, so this visitor takes no action for them.
 /// </summary>
 /// <param name="comment">The visited comment node (unused).</param>
 public void Visit(HtmlNode.Comment comment)
 {
 }
Пример #6
0
        /// <summary>
        /// Visitor callback for tags: adds the tag to <c>Result</c> when <c>predicate</c> accepts it,
        /// then passes this visitor on to each of the tag's children.
        /// </summary>
        /// <param name="tag">The tag being visited.</param>
        public void Visit(HtmlNode.Tag tag)
        {
            var isMatch = predicate(tag);
            if (isMatch)
            {
                Result.Add(tag);
            }

            // Descend regardless of whether the tag itself matched.
            foreach (var child in tag.Children)
            {
                child.AcceptVisitor(this);
            }
        }
 /// <summary>
 /// Works around a bug in the third-party Majestic13.HtmlParser component, whose parse result sometimes
 /// places elements below an <c>input</c> element although they are not really its children.
 /// Any <c>input</c> tag that has children is removed from its parent, and its children are inserted
 /// into the parent's child list at the position the <c>input</c> occupied.
 /// </summary>
 /// <param name="currentNode">The node to examine; expected to be a child of <paramref name="parentTag"/>.</param>
 /// <param name="parentTag">The parent tag that receives the hoisted children.</param>
 /// <param name="flag">Set to <c>true</c> when an <c>input</c> was unwrapped; it is never reset to <c>false</c> here.</param>
 private void FixInputElementChildrenBug(HtmlNode currentNode, HtmlNode.Tag parentTag, ref bool flag)
 {
     var currentTag = currentNode as HtmlNode.Tag;
     if (currentTag == null)
     {
         return;
     }

     if (currentTag.Name == "input" && currentTag.Children.Count > 0)
     {
         var position = parentTag.Children.IndexOf(currentTag);
         if (position < 0)
         {
             // The input is not (or no longer) attached to this parent, e.g. because the caller
             // retried on a node that was already unwrapped. Hoisting again would duplicate its
             // children and keep the caller's retry loop running forever.
             return;
         }

         // Replace the input with its children in place so that document order is preserved.
         // Clone() is presumed to return a shallow snapshot - defined outside this block.
         var hoistedNodes = currentTag.Children.Clone();
         parentTag.Children.RemoveAt(position);
         foreach (var hoistedNode in hoistedNodes)
         {
             parentTag.Children.Insert(position++, hoistedNode);
         }

         // The detached input must not keep references to nodes that now belong to the parent.
         currentTag.Children.Clear();
         flag = true;
         return;
     }

     // Iterate over a copy because the recursive calls modify the live child list.
     foreach (var childNode in currentTag.Children.Clone())
     {
         if (childNode is HtmlNode.Tag)
         {
             FixInputElementChildrenBug(childNode, currentTag, ref flag);
         }
     }
 }
        /// <summary>
        /// Removes every tag inside <paramref name="currentNode"/> (including the node itself) that
        /// <c>IsEmptyTag()</c> reports as empty. Children are processed first, so a tag that only
        /// becomes empty once its children are gone is removed as well.
        /// </summary>
        /// <param name="currentNode">The node to scan; nodes that are not tags are left untouched.</param>
        /// <param name="parentTag">The parent of <paramref name="currentNode"/>, from whose <c>Children</c> it is removed.</param>
        private void RemoveEmptyTags(HtmlNode currentNode, HtmlNode.Tag parentTag)
        {
            var candidate = currentNode as HtmlNode.Tag;
            if (candidate == null)
            {
                return;
            }

            // Already empty: drop it right away without descending.
            if (candidate.IsEmptyTag())
            {
                parentTag.Children.Remove(candidate);
                return;
            }

            // Walk a copy of the child list because the recursive calls remove entries from it.
            foreach (var descendant in candidate.Children.Clone())
            {
                RemoveEmptyTags(descendant, candidate);
            }

            // Pruning the children may have left this tag empty.
            if (candidate.IsEmptyTag())
            {
                parentTag.Children.Remove(candidate);
            }
        }
        /// <summary>
        /// Processes a body element: repairs wrongly nested <c>input</c> children, removes filtered and
        /// empty nodes, and then appends the simplified HTML of the remaining children to
        /// <paramref name="builder"/>.
        /// </summary>
        /// <param name="bodyTag">The body tag; its child list is modified in place.</param>
        /// <param name="builder">Receives the generated HTML text.</param>
        private void ProcessBodyElement(HtmlNode.Tag bodyTag, StringBuilder builder)
        {
            // Work around the parser bug where an input tag may contain child elements (e.g. div, p).
            // Each pass takes a fresh snapshot of the body's children. Retrying on the same node
            // reference would never terminate when that node is itself an unwrapped input, because it
            // is already detached yet still reports children; a fresh snapshot also lets nodes that
            // were hoisted into the body be scanned. Repeat until a pass changes nothing.
            bool changed;
            do
            {
                changed = false;
                foreach (var childNode in bodyTag.Children.Clone())
                {
                    FixInputElementChildrenBug(childNode, bodyTag, ref changed);
                }
            }
            while (changed);

            // Remove the unwanted nodes from the body.
            foreach (var childNode in bodyTag.Children.Clone())
            {
                FilterInvalidNodes(childNode, bodyTag);
            }

            // Remove the empty tags from the body.
            foreach (var childNode in bodyTag.Children.Clone())
            {
                RemoveEmptyTags(childNode, bodyTag);
            }

            // Convert the remaining children of the processed body into HTML text.
            foreach (var childNode in bodyTag.Children.Clone())
            {
                GenerateSimpleHtml(childNode, builder);
            }
        }
 /// <summary>
 /// Tells whether the given tag is a usable anchor: a non-null tag named <c>a</c> that has an
 /// <c>href</c> attribute whose value is not null, empty or whitespace.
 /// </summary>
 /// <param name="tag">The tag to test; may be <c>null</c>.</param>
 /// <returns><c>true</c> when the tag is an anchor with a non-blank href; otherwise <c>false</c>.</returns>
 private bool IsValidHrefTag(HtmlNode.Tag tag)
 {
     if (tag == null)
     {
         return false;
     }

     if (tag.Name != "a")
     {
         return false;
     }

     return tag.Attributes.Any(
         attribute => attribute.Key == "href" && !string.IsNullOrWhiteSpace(attribute.Value));
 }
        /// <summary>
        /// Appends a simplified HTML rendering of <paramref name="htmlNode"/> to <paramref name="builder"/>.
        /// Text and comment nodes contribute their trimmed value when it is not blank. Tags contribute an
        /// opening tag (built by <c>BuildHrefTagString</c> for valid href anchors, otherwise from
        /// <c>_tagStart</c>), their recursively rendered children and a closing tag from <c>_tagEnd</c>;
        /// <c>a</c> and <c>span</c> tags are followed by an <c>&amp;nbsp;</c> entity.
        /// Other node kinds produce no output.
        /// </summary>
        /// <param name="htmlNode">The node to render.</param>
        /// <param name="builder">Receives the generated text.</param>
        private void GenerateSimpleHtml(HtmlNode htmlNode, StringBuilder builder)
        {
            var textNode = htmlNode as HtmlNode.Text;
            if (textNode != null)
            {
                var content = textNode.Value;
                if (string.IsNullOrWhiteSpace(content))
                {
                    return;
                }
                builder.Append(content.Trim());
                return;
            }

            var commentNode = htmlNode as HtmlNode.Comment;
            if (commentNode != null)
            {
                var content = commentNode.Value;
                if (string.IsNullOrWhiteSpace(content))
                {
                    return;
                }
                builder.Append(content.Trim());
                return;
            }

            var element = htmlNode as HtmlNode.Tag;
            if (element == null)
            {
                return;
            }

            // Anchors with a usable href keep their link; every other tag is reduced to its bare name.
            var openingTag = IsValidHrefTag(element)
                ? BuildHrefTagString(element)
                : string.Format(_tagStart, element.Name);
            builder.Append(openingTag);

            foreach (var child in element.Children)
            {
                GenerateSimpleHtml(child, builder);
            }

            builder.AppendFormat(_tagEnd, element.Name);

            switch (element.Name)
            {
                case "a":
                case "span":
                    builder.Append("&nbsp;");
                    break;
            }
        }
 /// <summary>
 /// Appends the plain text contained in <paramref name="htmlNode"/> to <paramref name="builder"/>.
 /// Each non-blank text node contributes its trimmed value followed by one space; tags contribute
 /// only the text of their descendants. Other node kinds are ignored.
 /// </summary>
 /// <param name="htmlNode">The node whose text is collected.</param>
 /// <param name="builder">Receives the collected text.</param>
 private void GenerateRawText(HtmlNode htmlNode, StringBuilder builder)
 {
     var textNode = htmlNode as HtmlNode.Text;
     if (textNode != null)
     {
         var content = textNode.Value;
         if (!string.IsNullOrWhiteSpace(content))
         {
             // One space separates consecutive text fragments.
             builder.Append(content.Trim()).Append(" ");
         }
         return;
     }

     var element = htmlNode as HtmlNode.Tag;
     if (element == null)
     {
         return;
     }

     foreach (var descendant in element.Children)
     {
         GenerateRawText(descendant, builder);
     }
 }