/// <summary>
        /// Rewrites the <c>href</c> of each &lt;a&gt; and the <c>src</c> of each &lt;img&gt; collected from
        /// the given element by <c>NodeUtility.GetAllNodesWithTag</c>, passing every value through
        /// <c>uri.ToAbsoluteURI</c>. Links whose href is a <c>javascript:</c> URI are removed while
        /// their content is kept.
        /// </summary>
        /// <remarks>
        /// The handling of #ref URIs is decided by <c>ToAbsoluteURI</c>, which is defined elsewhere;
        /// the original summary of this method states that they are ignored.
        /// </remarks>
        /// <param name="articleContent">The node in which to fix all relative uri</param>
        /// <param name="uri">The base uri</param>
        /// <param name="doc">The document to operate on; used to create the replacement nodes</param>
        internal static void FixRelativeUris(IElement articleContent, Uri uri, IHtmlDocument doc)
        {
            var links = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "a" });

            NodeUtility.ForEachNode(links, (link) =>
            {
                // Skip anything that is not an element rather than dereferencing a failed cast.
                if (!(link is IElement anchor))
                {
                    return;
                }

                var href = anchor.GetAttribute("href");
                if (String.IsNullOrWhiteSpace(href))
                {
                    return;
                }

                // URI schemes are case-insensitive, and an ordinal comparison keeps the check
                // independent of the current culture.
                if (!href.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase))
                {
                    anchor.SetAttribute("href", uri.ToAbsoluteURI(href));
                    return;
                }

                // Remove links with javascript: URIs, since
                // they won't work after scripts have been removed from the page.
                if (link.ChildNodes.Length == 1 && link.ChildNodes[0].NodeType == NodeType.Text)
                {
                    // if the link only contains simple text content, it can be converted to a text node
                    var text = doc.CreateTextNode(link.TextContent);
                    link.Parent.ReplaceChild(text, link);
                }
                else
                {
                    // if the link has multiple children, they should all be preserved
                    var container = doc.CreateElement("span");
                    while (link.ChildNodes.Length > 0)
                    {
                        // Appending moves the node, so the first child changes on every pass.
                        container.AppendChild(link.ChildNodes[0]);
                    }
                    link.Parent.ReplaceChild(container, link);
                }
            });

            var imgs = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "img" });

            NodeUtility.ForEachNode(imgs, (img) =>
            {
                if (!(img is IElement image))
                {
                    return;
                }

                var src = image.GetAttribute("src");
                if (!String.IsNullOrWhiteSpace(src))
                {
                    image.SetAttribute("src", uri.ToAbsoluteURI(src));
                }
            });
        }
Beispiel #2
0
        /// <summary>
        /// Rewrites the <c>href</c> of each &lt;a&gt; and the <c>src</c> of each &lt;img&gt; collected from
        /// the given element by <c>NodeUtility.GetAllNodesWithTag</c>, passing every value through
        /// <c>uri.ToAbsoluteURI</c>. Links whose href is a <c>javascript:</c> URI are removed while
        /// their content is kept.
        /// </summary>
        /// <remarks>
        /// The handling of #ref URIs is decided by <c>ToAbsoluteURI</c>, which is defined elsewhere;
        /// the original summary of this method states that they are ignored.
        /// </remarks>
        /// <param name="articleContent">The node in which to fix all relative uri</param>
        /// <param name="uri">The base uri</param>
        /// <param name="doc">The document to operate on; used to create the replacement nodes</param>
        public static void FixRelativeUris(IElement articleContent, Uri uri, IHtmlDocument doc)
        {
            var links = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "a" });

            NodeUtility.ForEachNode(links, (link) =>
            {
                // Skip anything that is not an element rather than dereferencing a failed cast.
                if (!(link is IElement anchor))
                {
                    return;
                }

                var href = anchor.GetAttribute("href");
                if (String.IsNullOrWhiteSpace(href))
                {
                    return;
                }

                // URI schemes are case-insensitive, and an ordinal comparison keeps the check
                // independent of the current culture.
                if (!href.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase))
                {
                    anchor.SetAttribute("href", uri.ToAbsoluteURI(href));
                    return;
                }

                // Remove links with javascript: URIs, since
                // they won't work after scripts have been removed from the page.
                if (link.ChildNodes.Length == 1 && link.ChildNodes[0].NodeType == NodeType.Text)
                {
                    // A link holding a single text node can simply become that text.
                    var text = doc.CreateTextNode(link.TextContent);
                    link.Parent.ReplaceChild(text, link);
                }
                else
                {
                    // Flattening to TextContent would drop nested elements (images, emphasis, ...),
                    // so move every child into a neutral span instead.
                    var container = doc.CreateElement("span");
                    while (link.ChildNodes.Length > 0)
                    {
                        // Appending moves the node, so the first child changes on every pass.
                        container.AppendChild(link.ChildNodes[0]);
                    }
                    link.Parent.ReplaceChild(container, link);
                }
            });

            var imgs = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "img" });

            NodeUtility.ForEachNode(imgs, (img) =>
            {
                if (!(img is IElement image))
                {
                    return;
                }

                var src = image.GetAttribute("src");
                if (!String.IsNullOrWhiteSpace(src))
                {
                    image.SetAttribute("src", uri.ToAbsoluteURI(src));
                }
            });
        }
Beispiel #3
0
        /// <summary>
        /// Rewrites the <c>href</c> of each &lt;a&gt; and the <c>src</c>, <c>poster</c> and <c>srcset</c>
        /// attributes of each media element (img, picture, figure, video, audio, source) collected
        /// from the given element by <c>NodeUtility.GetAllNodesWithTag</c>, passing every URI through
        /// <c>uri.ToAbsoluteURI</c>. Links whose href is a <c>javascript:</c> URI are removed while
        /// their content is kept.
        /// </summary>
        /// <remarks>
        /// The handling of #ref URIs is decided by <c>ToAbsoluteURI</c>, which is defined elsewhere;
        /// the original summary of this method states that they are ignored.
        /// </remarks>
        /// <param name="articleContent">The node in which to fix all relative uri</param>
        /// <param name="uri">The base uri</param>
        /// <param name="doc">The document to operate on; used to create the replacement nodes</param>
        internal static void FixRelativeUris(IElement articleContent, Uri uri, IHtmlDocument doc)
        {
            var links = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "a" });

            NodeUtility.ForEachNode(links, (link) =>
            {
                // Skip anything that is not an element rather than dereferencing a failed cast.
                if (!(link is IElement anchor))
                {
                    return;
                }

                var href = anchor.GetAttribute("href");
                if (string.IsNullOrWhiteSpace(href))
                {
                    return;
                }

                // URI schemes are case-insensitive, and an ordinal comparison keeps the check
                // independent of the current culture.
                if (!href.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase))
                {
                    anchor.SetAttribute("href", uri.ToAbsoluteURI(href));
                    return;
                }

                // Remove links with javascript: URIs, since
                // they won't work after scripts have been removed from the page.
                if (link.ChildNodes.Length == 1 && link.ChildNodes[0].NodeType == NodeType.Text)
                {
                    // if the link only contains simple text content, it can be converted to a text node
                    var text = doc.CreateTextNode(link.TextContent);
                    link.Parent.ReplaceChild(text, link);
                }
                else
                {
                    // if the link has multiple children, they should all be preserved
                    var container = doc.CreateElement("span");
                    while (link.ChildNodes.Length > 0)
                    {
                        // Appending moves the node, so the first child changes on every pass.
                        container.AppendChild(link.ChildNodes[0]);
                    }
                    link.Parent.ReplaceChild(container, link);
                }
            });

            var medias = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "img", "picture", "figure", "video", "audio", "source" });

            NodeUtility.ForEachNode(medias, (media_node) => {
                if (!(media_node is IElement media))
                {
                    return;
                }

                // An empty or blank attribute carries no URI to resolve, so it is left alone,
                // mirroring the guard used for anchors above.
                var src = media.GetAttribute("src");
                if (!string.IsNullOrWhiteSpace(src))
                {
                    media.SetAttribute("src", uri.ToAbsoluteURI(src));
                }

                var poster = media.GetAttribute("poster");
                if (!string.IsNullOrWhiteSpace(poster))
                {
                    media.SetAttribute("poster", uri.ToAbsoluteURI(poster));
                }

                var srcset = media.GetAttribute("srcset");
                if (srcset != null)
                {
                    // RE_SrcSetUrl is defined elsewhere; group 1 is passed through ToAbsoluteURI and
                    // groups 2 and 3 are appended unchanged. A group that did not participate in the
                    // match yields an empty Value (never null), so no null handling is needed.
                    var newSrcset = RE_SrcSetUrl.Replace(srcset, (input) =>
                        uri.ToAbsoluteURI(input.Groups[1].Value) + input.Groups[2].Value + input.Groups[3].Value);

                    media.SetAttribute("srcset", newSrcset);
                }
            });
        }
Beispiel #4
0
        /// <summary>
        /// Try to extract metadata from JSON-LD object.
        /// For now, only Schema.org objects of type Article or its subtypes are supported.
        /// </summary>
        /// <remarks>
        /// Only the &lt;script type="application/ld+json"&gt; element returned by
        /// <c>NodeUtility.FindNode</c> is inspected. Extraction is best-effort: malformed JSON or
        /// properties of an unexpected shape are skipped rather than reported.
        /// Keys that may be produced: <c>jsonld:title</c>, <c>jsonld:author</c>,
        /// <c>jsonld:description</c>, <c>jsonld:siteName</c>, <c>jsonld:datePublished</c>,
        /// <c>jsonld:image</c>.
        /// </remarks>
        /// <param name="doc">The document</param>
        /// <returns>Dictionary with any metadata that could be extracted (possibly none)</returns>
        internal static Dictionary<string, string> GetJSONLD(IHtmlDocument doc)
        {
            var jsonLDMetadata = new Dictionary<string, string>();

            var scripts = NodeUtility.GetAllNodesWithTag(doc.DocumentElement, new string[] { "script" });

            var jsonLdElement = NodeUtility.FindNode(scripts, (el) => {
                return(el?.GetAttribute("type") == "application/ld+json");
            });

            if (jsonLdElement == null)
            {
                return jsonLDMetadata;
            }

            // Strip CDATA markers if present. The closing marker is anchored to the end of the
            // text (allowing trailing whitespace); the previous pattern looked for a literal '$'.
            var content = Regex.Replace(jsonLdElement.TextContent, @"^\s*<!\[CDATA\[|\]\]>\s*$", "");

            try
            {
                using (JsonDocument document = JsonDocument.Parse(content))
                {
                    var documentRoot = document.RootElement;

                    // TryGetProperty throws on anything that is not a JSON object (e.g. a top-level array).
                    if (documentRoot.ValueKind != JsonValueKind.Object)
                    {
                        return jsonLDMetadata;
                    }

                    var root = documentRoot;

                    // JsonLD can contain an array of elements inside property @graph
                    if (!documentRoot.TryGetProperty("@type", out _) &&
                        documentRoot.TryGetProperty("@graph", out var graph) &&
                        graph.ValueKind == JsonValueKind.Array)
                    {
                        foreach (var node in graph.EnumerateArray())
                        {
                            if (IsJsonLdArticleNode(node))
                            {
                                root = node;
                                break;
                            }
                        }
                    }

                    // With @graph the @context lives on the enclosing document, not on the selected
                    // node, so the document root is consulted as well as the node itself.
                    if (!HasSchemaOrgContext(documentRoot) && !HasSchemaOrgContext(root))
                    {
                        return jsonLDMetadata;
                    }

                    if (!IsJsonLdArticleNode(root))
                    {
                        return jsonLDMetadata;
                    }

                    // "headline" is read after "name" and therefore wins when both are present.
                    if (TryGetJsonString(root, "name", out var name))
                    {
                        jsonLDMetadata["jsonld:title"] = name.Trim();
                    }
                    if (TryGetJsonString(root, "headline", out var headline))
                    {
                        jsonLDMetadata["jsonld:title"] = headline.Trim();
                    }

                    if (root.TryGetProperty("author", out var authorValue))
                    {
                        if (authorValue.ValueKind == JsonValueKind.Object)
                        {
                            if (TryGetJsonString(authorValue, "name", out var authorName))
                            {
                                jsonLDMetadata["jsonld:author"] = authorName.Trim();
                            }
                        }
                        else if (authorValue.ValueKind == JsonValueKind.Array)
                        {
                            // Entries without a string "name" are skipped instead of aborting the
                            // whole extraction.
                            var byline = new List<string>();
                            foreach (var author in authorValue.EnumerateArray())
                            {
                                if (TryGetJsonString(author, "name", out var entryName))
                                {
                                    byline.Add(entryName.Trim());
                                }
                            }

                            if (byline.Count > 0)
                            {
                                jsonLDMetadata["jsonld:author"] = String.Join(", ", byline);
                            }
                        }
                    }

                    if (TryGetJsonString(root, "description", out var description))
                    {
                        jsonLDMetadata["jsonld:description"] = description.Trim();
                    }

                    if (root.TryGetProperty("publisher", out var publisher) &&
                        TryGetJsonString(publisher, "name", out var publisherName))
                    {
                        jsonLDMetadata["jsonld:siteName"] = publisherName.Trim();
                    }

                    // These values are plain strings; they are read directly rather than looked up
                    // as nested properties (which always threw on a string element).
                    if (TryGetJsonString(root, "datePublished", out var datePublished))
                    {
                        jsonLDMetadata["jsonld:datePublished"] = datePublished;
                    }
                    if (TryGetJsonString(root, "image", out var image))
                    {
                        jsonLDMetadata["jsonld:image"] = image;
                    }
                }
            }
            catch (Exception e) when (e is JsonException || e is InvalidOperationException)
            {
                // Malformed JSON-LD is common on the web and metadata extraction is best-effort:
                // return whatever was collected before the failure.
            }

            return jsonLDMetadata;
        }

        /// <summary>
        /// Reads a string-valued property from a JSON object without throwing when the element is
        /// not an object, the property is missing, or its value is not a string.
        /// </summary>
        private static bool TryGetJsonString(JsonElement obj, string propertyName, out string result)
        {
            if (obj.ValueKind == JsonValueKind.Object &&
                obj.TryGetProperty(propertyName, out var property) &&
                property.ValueKind == JsonValueKind.String)
            {
                result = property.GetString();
                return true;
            }

            result = String.Empty;
            return false;
        }

        /// <summary>
        /// Tells whether the node has a string @type accepted by <c>RE_JsonLdArticleTypes</c>.
        /// </summary>
        private static bool IsJsonLdArticleNode(JsonElement node)
        {
            return TryGetJsonString(node, "@type", out var type) &&
                   RE_JsonLdArticleTypes.IsMatch(type);
        }

        /// <summary>
        /// Tells whether the node has a string @context naming schema.org (http or https, with or
        /// without a trailing slash).
        /// </summary>
        private static bool HasSchemaOrgContext(JsonElement node)
        {
            return TryGetJsonString(node, "@context", out var context) &&
                   Regex.IsMatch(context, @"^https?\:\/\/schema\.org\/?$");
        }