/// <summary>
/// Swaps <paramref name="child"/> for a plain text node carrying its text content,
/// or simply removes it from <paramref name="parent"/> when it has no text content.
/// </summary>
/// <param name="parent">The node that currently holds <paramref name="child"/></param>
/// <param name="child">The element to be replaced or removed</param>
/// <param name="document">The document used to create the replacement text node</param>
private void ReplaceChildElementByText(INode parent, IElement child, IHtmlDocument document)
{
    var content = child.TextContent;

    if (string.IsNullOrEmpty(content))
    {
        // Nothing worth keeping: drop the element altogether.
        parent.RemoveChild(child);
        return;
    }

    // Preserve the text by putting a text node where the element used to be.
    var replacement = document.CreateTextNode(content);
    parent.ReplaceChild(replacement, child);
}
/// <summary>
/// Converts the href of each &lt;a&gt; and the src of each &lt;img&gt; in the given element,
/// and its descendants, to an absolute URI. Links whose href starts with "javascript:"
/// are replaced by a text node holding their text content.
/// </summary>
/// <param name="articleContent">The node in which to fix all relative uri</param>
/// <param name="uri">The base uri that relative uris are resolved against</param>
/// <param name="doc">The document used to create replacement text nodes</param>
/// <remarks>
/// Resolution itself is delegated to <c>uri.ToAbsoluteURI</c>.
/// NOTE(review): fragment-only (#ref) URIs are presumably left untouched by that helper - confirm.
/// </remarks>
public static void FixRelativeUris(IElement articleContent, Uri uri, IHtmlDocument doc)
{
    var links = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "a" });
    NodeUtility.ForEachNode(links, (link) =>
    {
        // Cast once and bail out early instead of dereferencing an unchecked "as" result.
        var anchor = link as IElement;
        var href = anchor?.GetAttribute("href");
        if (string.IsNullOrWhiteSpace(href))
        {
            return;
        }

        // Ordinal prefix test: the culture-sensitive IndexOf overload can behave
        // differently depending on the current culture / globalization backend.
        if (href.StartsWith("javascript:", StringComparison.Ordinal))
        {
            // Replace links with javascript: URIs with text content, since
            // they won't work after scripts have been removed from the page.
            var text = doc.CreateTextNode(link.TextContent);

            // A link without a parent has nothing to be replaced in.
            link.Parent?.ReplaceChild(text, link);
        }
        else
        {
            anchor.SetAttribute("href", uri.ToAbsoluteURI(href));
        }
    });

    var imgs = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "img" });
    NodeUtility.ForEachNode(imgs, (img) =>
    {
        var image = img as IElement;
        var src = image?.GetAttribute("src");
        if (!string.IsNullOrWhiteSpace(src))
        {
            image.SetAttribute("src", uri.ToAbsoluteURI(src));
        }
    });
}
/// <summary>
/// Converts the href of each &lt;a&gt;, and the src, poster and srcset of each media element
/// (&lt;img&gt;, &lt;picture&gt;, &lt;figure&gt;, &lt;video&gt;, &lt;audio&gt;, &lt;source&gt;)
/// in the given element, and its descendants, to an absolute URI.
/// Links whose href starts with "javascript:" are removed while their content is kept.
/// </summary>
/// <param name="articleContent">The node in which to fix all relative uri</param>
/// <param name="uri">The base uri</param>
/// <param name="doc">The document to operate on</param>
/// <remarks>
/// Resolution itself is delegated to <c>uri.ToAbsoluteURI</c>.
/// NOTE(review): fragment-only (#ref) URIs are presumably left untouched by that helper - confirm.
/// </remarks>
internal static void FixRelativeUris(IElement articleContent, Uri uri, IHtmlDocument doc)
{
    var links = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "a" });
    NodeUtility.ForEachNode(links, (link) =>
    {
        // Cast once and bail out early instead of dereferencing an unchecked "as" result.
        var anchor = link as IElement;
        var href = anchor?.GetAttribute("href");
        if (string.IsNullOrWhiteSpace(href))
        {
            return;
        }

        // Ordinal prefix test: the culture-sensitive IndexOf overload can behave
        // differently depending on the current culture / globalization backend.
        if (!href.StartsWith("javascript:", StringComparison.Ordinal))
        {
            anchor.SetAttribute("href", uri.ToAbsoluteURI(href));
            return;
        }

        // Remove links with javascript: URIs, since
        // they won't work after scripts have been removed from the page.
        if (link.ChildNodes.Length == 1 && link.ChildNodes[0].NodeType == NodeType.Text)
        {
            // if the link only contains simple text content, it can be converted to a text node
            var text = doc.CreateTextNode(link.TextContent);

            // A link without a parent has nothing to be replaced in.
            link.Parent?.ReplaceChild(text, link);
        }
        else
        {
            // if the link has multiple children, they should all be preserved
            var container = doc.CreateElement("span");
            while (link.ChildNodes.Length > 0)
            {
                // Appending moves the node, so the first child changes on every pass.
                container.AppendChild(link.ChildNodes[0]);
            }

            link.Parent?.ReplaceChild(container, link);
        }
    });

    var medias = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "img", "picture", "figure", "video", "audio", "source" });
    NodeUtility.ForEachNode(medias, (mediaNode) =>
    {
        var media = mediaNode as IElement;
        if (media == null)
        {
            return;
        }

        // Read every attribute before writing any of them back.
        var src = media.GetAttribute("src");
        var poster = media.GetAttribute("poster");
        var srcset = media.GetAttribute("srcset");

        if (src != null)
        {
            media.SetAttribute("src", uri.ToAbsoluteURI(src));
        }

        if (poster != null)
        {
            media.SetAttribute("poster", uri.ToAbsoluteURI(poster));
        }

        if (srcset != null)
        {
            // Only group 1 is resolved against the base uri; groups 2 and 3 are copied verbatim.
            // NOTE(review): presumably group 1 is the candidate URL and groups 2/3 the
            // descriptor and separator - confirm against the RE_SrcSetUrl pattern.
            // An unmatched group yields an empty Value, so no null handling is needed.
            var newSrcset = RE_SrcSetUrl.Replace(srcset, (input) =>
            {
                return uri.ToAbsoluteURI(input.Groups[1].Value) + input.Groups[2].Value + input.Groups[3].Value;
            });
            media.SetAttribute("srcset", newSrcset);
        }
    });
}