Example #1
0
 /// <summary>
 /// Swaps <paramref name="child"/> for a text node holding its text content,
 /// or removes it from <paramref name="parent"/> when it has no text at all.
 /// </summary>
 /// <param name="parent">The node that currently contains <paramref name="child"/></param>
 /// <param name="child">The element to replace or remove</param>
 /// <param name="document">The document used to create the replacement text node</param>
 private void ReplaceChildElementByText(INode parent, IElement child, IHtmlDocument document)
 {
     var text = child.TextContent;

     // Nothing to preserve: drop the element entirely
     if (string.IsNullOrEmpty(text))
     {
         parent.RemoveChild(child);
         return;
     }

     // Keep the text, lose the markup around it
     var replacement = document.CreateTextNode(text);
     parent.ReplaceChild(replacement, child);
 }
Example #2
0
        /// <summary>
        /// Converts the href of each &lt;a&gt; and the src of each &lt;img&gt; in the given element,
        /// and its descendants, to an absolute URI. Links whose href starts with "javascript:"
        /// are replaced by a text node holding the link's text content.
        /// </summary>
        /// <remarks>
        /// The resolution itself, including the treatment of fragment-only (#ref) URIs,
        /// is delegated to <c>ToAbsoluteURI</c>.
        /// </remarks>
        /// <param name="articleContent">The node in which to fix all relative uri</param>
        /// <param name="uri">The base uri passed to <c>ToAbsoluteURI</c> to resolve each relative uri</param>
        /// <param name="doc">The document used to create the replacement text nodes</param>
        public static void FixRelativeUris(IElement articleContent, Uri uri, IHtmlDocument doc)
        {
            var links = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "a" });

            NodeUtility.ForEachNode(links, (link) =>
            {
                // Skip anything that is not an element instead of dereferencing a failed cast
                var anchor = link as IElement;
                var href   = anchor == null ? null : anchor.GetAttribute("href");

                if (!String.IsNullOrWhiteSpace(href))
                {
                    // Replace links with javascript: URIs with text content, since
                    // they won't work after scripts have been removed from the page.
                    // Ordinal prefix test: the check must not depend on the current culture.
                    if (href.StartsWith("javascript:", StringComparison.Ordinal))
                    {
                        var text = doc.CreateTextNode(link.TextContent);
                        link.Parent.ReplaceChild(text, link);
                    }
                    else
                    {
                        anchor.SetAttribute("href", uri.ToAbsoluteURI(href));
                    }
                }
            });

            var imgs = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "img" });

            NodeUtility.ForEachNode(imgs, (img) =>
            {
                var image = img as IElement;
                var src   = image == null ? null : image.GetAttribute("src");

                // Images without a usable src are left untouched
                if (!String.IsNullOrWhiteSpace(src))
                {
                    image.SetAttribute("src", uri.ToAbsoluteURI(src));
                }
            });
        }
Example #3
0
        /// <summary>
        /// Converts the uris found in the given element, and its descendants, to absolute URIs:
        /// the href of each &lt;a&gt;, and the src, poster and srcset attributes of each
        /// &lt;img&gt;, &lt;picture&gt;, &lt;figure&gt;, &lt;video&gt;, &lt;audio&gt; and &lt;source&gt;.
        /// Links whose href starts with "javascript:" are removed, keeping their content.
        /// </summary>
        /// <remarks>
        /// The resolution itself, including the treatment of fragment-only (#ref) URIs,
        /// is delegated to <c>ToAbsoluteURI</c>.
        /// </remarks>
        /// <param name="articleContent">The node in which to fix all relative uri</param>
        /// <param name="uri">The base uri</param>
        /// <param name="doc">The document to operate on</param>
        internal static void FixRelativeUris(IElement articleContent, Uri uri, IHtmlDocument doc)
        {
            var links = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "a" });

            NodeUtility.ForEachNode(links, (link) =>
            {
                // Skip anything that is not an element instead of dereferencing a failed cast
                var anchor = link as IElement;
                var href   = anchor?.GetAttribute("href");

                if (!string.IsNullOrWhiteSpace(href))
                {
                    // Remove links with javascript: URIs, since
                    // they won't work after scripts have been removed from the page.
                    // Ordinal prefix test: the check must not depend on the current culture.
                    if (href.StartsWith("javascript:", StringComparison.Ordinal))
                    {
                        // if the link only contains simple text content, it can be converted to a text node
                        if (link.ChildNodes.Length == 1 && link.ChildNodes[0].NodeType == NodeType.Text)
                        {
                            var text = doc.CreateTextNode(link.TextContent);
                            link.Parent.ReplaceChild(text, link);
                        }
                        else
                        {
                            // if the link has multiple children, they should all be preserved
                            var container = doc.CreateElement("span");

                            // Appending a child to the container detaches it from the link,
                            // so the first remaining child is moved until none is left
                            while (link.ChildNodes.Length > 0)
                            {
                                container.AppendChild(link.ChildNodes[0]);
                            }
                            link.Parent.ReplaceChild(container, link);
                        }
                    }
                    else
                    {
                        anchor.SetAttribute("href", uri.ToAbsoluteURI(href));
                    }
                }
            });

            var medias = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "img", "picture", "figure", "video", "audio", "source" });

            NodeUtility.ForEachNode(medias, (mediaNode) =>
            {
                var media = mediaNode as IElement;
                if (media != null)
                {
                    var src    = media.GetAttribute("src");
                    var poster = media.GetAttribute("poster");
                    var srcset = media.GetAttribute("srcset");

                    if (src != null)
                    {
                        media.SetAttribute("src", uri.ToAbsoluteURI(src));
                    }

                    if (poster != null)
                    {
                        media.SetAttribute("poster", uri.ToAbsoluteURI(poster));
                    }

                    if (srcset != null)
                    {
                        // NOTE(review): presumably group 1 is the candidate url, group 2 its optional
                        // descriptor and group 3 the trailing separator - confirm against RE_SrcSetUrl.
                        // A group that did not take part in the match has an empty Value (it is never
                        // null), so the three parts can be concatenated directly.
                        var newSrcset = RE_SrcSetUrl.Replace(srcset, (input) =>
                        {
                            return uri.ToAbsoluteURI(input.Groups[1].Value) + input.Groups[2].Value + input.Groups[3].Value;
                        });

                        media.SetAttribute("srcset", newSrcset);
                    }
                }
            });
        }