/// <summary>
/// Removes unwanted nodes from the subtree rooted at <paramref name="currentNode"/>:
/// tags whose name is contained in <c>_filterTags</c>, script nodes and comment nodes.
/// </summary>
/// <param name="currentNode">The node to inspect.</param>
/// <param name="parentTag">
/// The tag whose <c>Children</c> list holds <paramref name="currentNode"/>;
/// removals are performed on that list.
/// </param>
private void FilterInvalidNodes(HtmlNode currentNode, HtmlNode.Tag parentTag)
{
    var currentTag = currentNode as HtmlNode.Tag;
    if (currentTag != null)
    {
        if (_filterTags.Contains(currentTag.Name))
        {
            parentTag.Children.Remove(currentTag);
            return;
        }

        // Recurse into EVERY child, not only tags: script and comment nodes are
        // handled by the branch below, so skipping non-tag children would leave
        // nested scripts/comments in the tree.
        // Iterate over the Clone() copy because the recursive call may remove
        // entries from currentTag.Children.
        foreach (var childNode in currentTag.Children.Clone())
        {
            FilterInvalidNodes(childNode, currentTag);
        }

        return;
    }

    if (currentNode is HtmlNode.Script || currentNode is HtmlNode.Comment)
    {
        parentTag.Children.Remove(currentNode);
    }
}
/// <summary>
/// Builds the opening <c>&lt;a&gt;</c> tag string for an anchor tag.
/// </summary>
/// <param name="tag">
/// The anchor tag. It must carry an <c>href</c> attribute: the value is read
/// through the indexer without a prior existence check.
/// </param>
/// <returns>
/// <c>&lt;a href="..."&gt;</c>, or <c>&lt;a href="..." target="..."&gt;</c> when the tag
/// has a non-empty <c>target</c> attribute. An href that does not start with
/// <c>http://</c> or <c>https://</c> is prefixed with <c>_currentUrlDomain</c>.
/// </returns>
private string BuildHrefTagString(HtmlNode.Tag tag)
{
    var href = tag.Attributes["href"];

    // URL schemes are case-insensitive and not linguistic text, so compare
    // ordinally and ignore case; the culture-sensitive default overload would
    // treat "HTTP://..." as a relative link.
    var hasHttpScheme =
        href.StartsWith("http://", System.StringComparison.OrdinalIgnoreCase) ||
        href.StartsWith("https://", System.StringComparison.OrdinalIgnoreCase);
    if (!hasHttpScheme)
    {
        href = _currentUrlDomain + href;
    }

    // Single lookup instead of ContainsKey followed by the indexer.
    string target;
    if (tag.Attributes.TryGetValue("target", out target) && !string.IsNullOrEmpty(target))
    {
        return string.Format("<a href=\"{0}\" target=\"{1}\">", href, target);
    }

    return string.Format("<a href=\"{0}\">", href);
}
/// <summary>
/// Visitor callback for text nodes. The body is empty, so this visitor takes no action for them.
/// </summary>
/// <param name="text">The visited text node (unused).</param>
public void Visit(HtmlNode.Text text) { }
/// <summary>
/// Visitor callback for script nodes. The body is empty, so this visitor takes no action for them.
/// </summary>
/// <param name="script">The visited script node (unused).</param>
public void Visit(HtmlNode.Script script) { }
/// <summary>
/// Visitor callback for comment nodes. The body is empty, so this visitor takes no action for them.
/// </summary>
/// <param name="comment">The visited comment node (unused).</param>
public void Visit(HtmlNode.Comment comment) { }
/// <summary>
/// Visitor callback for tags: adds the tag to <c>Result</c> when <c>predicate</c>
/// accepts it, then passes this visitor on to each of the tag's children.
/// </summary>
/// <param name="tag">The visited tag.</param>
public void Visit(HtmlNode.Tag tag)
{
    var isMatch = predicate(tag);
    if (isMatch)
    {
        Result.Add(tag);
    }

    // Children are visited whether or not the tag itself matched.
    tag.Children.ForEach(child => child.AcceptVisitor(this));
}
/// <summary>
/// Works around a defect in the third-party Majestic13.HtmlParser output: it
/// sometimes reports elements as children of an <c>input</c> element although
/// they are not. When such an <c>input</c> is found, it is removed from
/// <paramref name="parentTag"/> and its children take its place in the parent's
/// child list.
/// </summary>
/// <param name="currentNode">The node to inspect; non-tag nodes are ignored.</param>
/// <param name="parentTag">The tag whose <c>Children</c> list holds <paramref name="currentNode"/>.</param>
/// <param name="flag">
/// Set to <c>true</c> when a hoist was performed. It is never reset to
/// <c>false</c> here; the caller resets it before each pass.
/// </param>
private void FixInputElementChildrenBug(HtmlNode currentNode, HtmlNode.Tag parentTag, ref bool flag)
{
    var currentTag = currentNode as HtmlNode.Tag;
    if (currentTag == null)
    {
        return;
    }

    if (currentTag.Name == "input" && currentTag.Children.Count > 0)
    {
        // Put the hoisted children where the input was, so document order is
        // preserved instead of moving them to the end of the parent.
        var insertAt = parentTag.Children.IndexOf(currentTag);
        if (insertAt >= 0)
        {
            parentTag.Children.RemoveAt(insertAt);
        }
        else
        {
            // The input is not (or no longer) in the parent: fall back to appending.
            insertAt = parentTag.Children.Count;
        }

        foreach (var childNode in currentTag.Children.Clone())
        {
            parentTag.Children.Insert(insertAt, childNode);
            insertAt++;
        }

        // Detach the children from the removed input. Without this, a caller that
        // re-runs this method on the same (already removed) input would hoist the
        // same children again on every pass and never terminate.
        currentTag.Children.Clear();

        flag = true;
        return;
    }

    // Iterate over the Clone() copy because the recursive call may modify
    // currentTag.Children.
    foreach (var childNode in currentTag.Children.Clone())
    {
        if (childNode is HtmlNode.Tag)
        {
            FixInputElementChildrenBug(childNode, currentTag, ref flag);
        }
    }
}
/// <summary>
/// Removes every empty tag inside <paramref name="currentNode"/>, including
/// <paramref name="currentNode"/> itself when it is (or becomes) empty.
/// </summary>
/// <param name="currentNode">The node to scan; non-tag nodes are left untouched.</param>
/// <param name="parentTag">The parent tag of <paramref name="currentNode"/>.</param>
private void RemoveEmptyTags(HtmlNode currentNode, HtmlNode.Tag parentTag)
{
    var tag = currentNode as HtmlNode.Tag;
    if (tag == null)
    {
        return;
    }

    if (!tag.IsEmptyTag())
    {
        // Clean the descendants first; iterate over the Clone() copy because the
        // recursive call may remove entries from tag.Children.
        foreach (var child in tag.Children.Clone())
        {
            RemoveEmptyTags(child, tag);
        }

        // Still has content after its descendants were cleaned: keep it.
        if (!tag.IsEmptyTag())
        {
            return;
        }
    }

    // Empty from the start, or emptied by removing its descendants.
    parentTag.Children.Remove(tag);
}
/// <summary>
/// Processes a body element: repairs misplaced children of <c>input</c> tags,
/// filters out invalid nodes, removes empty tags and finally appends the
/// simplified HTML of the remaining children to <paramref name="builder"/>.
/// </summary>
/// <param name="bodyTag">The body tag to process; its child list is modified in place.</param>
/// <param name="builder">Receives the generated HTML text.</param>
private void ProcessBodyElement(HtmlNode.Tag bodyTag, StringBuilder builder)
{
    // Repair input tags that were reported with child elements (e.g. div, p).
    // A fresh copy of the body's children is taken on every pass so that
    //  - an input already removed from the body is not processed again
    //    (re-processing it would hoist the same children forever), and
    //  - nodes hoisted up to body level are scanned by the next pass.
    bool hoisted;
    do
    {
        hoisted = false;
        foreach (var childNode in bodyTag.Children.Clone())
        {
            FixInputElementChildrenBug(childNode, bodyTag, ref hoisted);
        }
    }
    while (hoisted);

    // Filter out the invalid nodes in the body.
    foreach (var childNode in bodyTag.Children.Clone())
    {
        FilterInvalidNodes(childNode, bodyTag);
    }

    // Remove the empty tags in the body.
    foreach (var childNode in bodyTag.Children.Clone())
    {
        RemoveEmptyTags(childNode, bodyTag);
    }

    // Convert the remaining children of the processed body to HTML text.
    foreach (var childNode in bodyTag.Children.Clone())
    {
        GenerateSimpleHtml(childNode, builder);
    }
}
/// <summary>
/// Tells whether the given tag is a valid anchor, i.e. an <c>a</c> tag with a
/// non-blank <c>href</c> attribute.
/// </summary>
/// <param name="tag">The tag to check; may be <c>null</c>.</param>
/// <returns><c>true</c> for an <c>a</c> tag whose <c>href</c> value is not null, empty or whitespace; otherwise <c>false</c>.</returns>
private bool IsValidHrefTag(HtmlNode.Tag tag)
{
    if (tag == null || tag.Name != "a")
    {
        return false;
    }

    return tag.Attributes.Any(attribute =>
        attribute.Key == "href" && !string.IsNullOrWhiteSpace(attribute.Value));
}
/// <summary>
/// Appends a simplified HTML rendering of <paramref name="htmlNode"/> to
/// <paramref name="builder"/>. Text and comment values are appended trimmed;
/// tags are written with the <c>_tagStart</c>/<c>_tagEnd</c> formats, except valid
/// anchors, whose opening tag comes from <c>BuildHrefTagString</c>.
/// </summary>
/// <param name="htmlNode">The node to render.</param>
/// <param name="builder">Receives the generated HTML.</param>
private void GenerateSimpleHtml(HtmlNode htmlNode, StringBuilder builder)
{
    var textNode = htmlNode as HtmlNode.Text;
    if (textNode != null)
    {
        var text = textNode.Value;
        if (!string.IsNullOrWhiteSpace(text))
        {
            builder.Append(text.Trim());
        }

        return;
    }

    var commentNode = htmlNode as HtmlNode.Comment;
    if (commentNode != null)
    {
        // NOTE(review): the comment's content is emitted like ordinary text —
        // confirm this is intended.
        var comment = commentNode.Value;
        if (!string.IsNullOrWhiteSpace(comment))
        {
            builder.Append(comment.Trim());
        }

        return;
    }

    var tag = htmlNode as HtmlNode.Tag;
    if (tag == null)
    {
        return;
    }

    var openingTag = IsValidHrefTag(tag)
        ? BuildHrefTagString(tag)
        : string.Format(_tagStart, tag.Name);
    builder.Append(openingTag);

    foreach (var child in tag.Children)
    {
        GenerateSimpleHtml(child, builder);
    }

    var closingTag = string.Format(_tagEnd, tag.Name);
    builder.Append(closingTag);

    // Inline elements are followed by a space.
    var isInline = tag.Name == "a" || tag.Name == "span";
    if (isInline)
    {
        builder.Append(" ");
    }
}
/// <summary>
/// Appends the plain text contained in <paramref name="htmlNode"/> to
/// <paramref name="builder"/>. Each non-blank text value is trimmed and
/// followed by a single space; tags contribute only the text of their children.
/// </summary>
/// <param name="htmlNode">The node to extract text from.</param>
/// <param name="builder">Receives the extracted text.</param>
private void GenerateRawText(HtmlNode htmlNode, StringBuilder builder)
{
    var textNode = htmlNode as HtmlNode.Text;
    if (textNode != null)
    {
        var content = textNode.Value;
        if (!string.IsNullOrWhiteSpace(content))
        {
            // One space separates consecutive text fragments.
            builder.Append(content.Trim()).Append(" ");
        }

        return;
    }

    var tagNode = htmlNode as HtmlNode.Tag;
    if (tagNode == null)
    {
        return;
    }

    foreach (var child in tagNode.Children)
    {
        GenerateRawText(child, builder);
    }
}