/// <summary>
/// Converts each &lt;a&gt; href and &lt;img&gt; src in the given element, and its descendants, to an absolute URI.
/// The conversion itself (including any special handling of #ref URIs) is delegated to
/// <c>uri.ToAbsoluteURI</c>.
/// </summary>
/// <remarks>
/// Anchors whose href starts with <c>javascript:</c> are removed: an anchor holding a single text node
/// is replaced by a text node, any other anchor is replaced by a &lt;span&gt; that receives all its children.
/// Attributes that are missing or blank are left untouched.
/// </remarks>
/// <param name="articleContent">The node in which to fix all relative uri</param>
/// <param name="uri">The base uri</param>
/// <param name="doc">The document to operate on</param>
internal static void FixRelativeUris(IElement articleContent, Uri uri, IHtmlDocument doc)
{
    var links = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "a" });
    NodeUtility.ForEachNode(links, (link) =>
    {
        // Skip anything that is not an element instead of dereferencing a failed cast.
        if (!(link is IElement anchor))
        {
            return;
        }

        var href = anchor.GetAttribute("href");
        if (string.IsNullOrWhiteSpace(href))
        {
            return;
        }

        // Ordinal prefix test: a scheme check must not depend on the current culture,
        // and there is no need to scan the whole string as IndexOf would.
        if (!href.StartsWith("javascript:", StringComparison.Ordinal))
        {
            anchor.SetAttribute("href", uri.ToAbsoluteURI(href));
            return;
        }

        // Remove links with javascript: URIs, since
        // they won't work after scripts have been removed from the page.
        if (anchor.ChildNodes.Length == 1 && anchor.ChildNodes[0].NodeType == NodeType.Text)
        {
            // if the link only contains simple text content, it can be converted to a text node
            var text = doc.CreateTextNode(anchor.TextContent);
            anchor.Parent.ReplaceChild(text, anchor);
        }
        else
        {
            // if the link has multiple children, they should all be preserved
            var container = doc.CreateElement("span");
            while (anchor.ChildNodes.Length > 0)
            {
                // AppendChild moves the node, so the anchor's child list shrinks on every pass.
                container.AppendChild(anchor.ChildNodes[0]);
            }
            anchor.Parent.ReplaceChild(container, anchor);
        }
    });

    var imgs = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "img" });
    NodeUtility.ForEachNode(imgs, (img) =>
    {
        if (!(img is IElement image))
        {
            return;
        }

        var src = image.GetAttribute("src");
        if (!string.IsNullOrWhiteSpace(src))
        {
            image.SetAttribute("src", uri.ToAbsoluteURI(src));
        }
    });
}
/// <summary>
/// Converts each &lt;a&gt; href and &lt;img&gt; src in the given element, and its descendants, to an absolute URI.
/// The conversion itself (including any special handling of #ref URIs) is delegated to
/// <c>uri.ToAbsoluteURI</c>.
/// </summary>
/// <remarks>
/// Anchors whose href starts with <c>javascript:</c> are replaced by a text node holding the anchor's
/// text content. Attributes that are missing or blank are left untouched.
/// </remarks>
/// <param name="articleContent">The node in which to fix all relative uri</param>
/// <param name="uri">The base uri</param>
/// <param name="doc">The document used to create replacement text nodes</param>
public static void FixRelativeUris(IElement articleContent, Uri uri, IHtmlDocument doc)
{
    var links = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "a" });
    NodeUtility.ForEachNode(links, (link) =>
    {
        // Skip anything that is not an element instead of dereferencing a failed cast.
        if (!(link is IElement anchor))
        {
            return;
        }

        var href = anchor.GetAttribute("href");
        if (string.IsNullOrWhiteSpace(href))
        {
            return;
        }

        // Ordinal prefix test: a scheme check must not depend on the current culture.
        if (href.StartsWith("javascript:", StringComparison.Ordinal))
        {
            // Replace links with javascript: URIs with text content, since
            // they won't work after scripts have been removed from the page.
            var text = doc.CreateTextNode(anchor.TextContent);
            anchor.Parent.ReplaceChild(text, anchor);
        }
        else
        {
            anchor.SetAttribute("href", uri.ToAbsoluteURI(href));
        }
    });

    var imgs = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "img" });
    NodeUtility.ForEachNode(imgs, (img) =>
    {
        if (!(img is IElement image))
        {
            return;
        }

        var src = image.GetAttribute("src");
        if (!string.IsNullOrWhiteSpace(src))
        {
            image.SetAttribute("src", uri.ToAbsoluteURI(src));
        }
    });
}
/// <summary>
/// Converts the URIs of links and media elements in the given element, and its descendants, to absolute URIs.
/// The conversion itself (including any special handling of #ref URIs) is delegated to
/// <c>uri.ToAbsoluteURI</c>.
/// </summary>
/// <remarks>
/// <para>
/// Anchors whose href starts with <c>javascript:</c> are removed: an anchor holding a single text node
/// is replaced by a text node, any other anchor is replaced by a &lt;span&gt; that receives all its children.
/// </para>
/// <para>
/// For img, picture, figure, video, audio and source elements the <c>src</c>, <c>poster</c> and
/// <c>srcset</c> attributes are rewritten. Blank <c>src</c>/<c>poster</c> values are left untouched,
/// the same way blank hrefs are.
/// </para>
/// </remarks>
/// <param name="articleContent">The node in which to fix all relative uri</param>
/// <param name="uri">The base uri</param>
/// <param name="doc">The document to operate on</param>
internal static void FixRelativeUris(IElement articleContent, Uri uri, IHtmlDocument doc)
{
    var links = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "a" });
    NodeUtility.ForEachNode(links, (link) =>
    {
        // Skip anything that is not an element instead of dereferencing a failed cast.
        if (!(link is IElement anchor))
        {
            return;
        }

        var href = anchor.GetAttribute("href");
        if (string.IsNullOrWhiteSpace(href))
        {
            return;
        }

        // Ordinal prefix test: a scheme check must not depend on the current culture.
        if (!href.StartsWith("javascript:", StringComparison.Ordinal))
        {
            anchor.SetAttribute("href", uri.ToAbsoluteURI(href));
            return;
        }

        // Remove links with javascript: URIs, since
        // they won't work after scripts have been removed from the page.
        if (anchor.ChildNodes.Length == 1 && anchor.ChildNodes[0].NodeType == NodeType.Text)
        {
            // if the link only contains simple text content, it can be converted to a text node
            var text = doc.CreateTextNode(anchor.TextContent);
            anchor.Parent.ReplaceChild(text, anchor);
        }
        else
        {
            // if the link has multiple children, they should all be preserved
            var container = doc.CreateElement("span");
            while (anchor.ChildNodes.Length > 0)
            {
                // AppendChild moves the node, so the anchor's child list shrinks on every pass.
                container.AppendChild(anchor.ChildNodes[0]);
            }
            anchor.Parent.ReplaceChild(container, anchor);
        }
    });

    var medias = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "img", "picture", "figure", "video", "audio", "source" });
    NodeUtility.ForEachNode(medias, (mediaNode) =>
    {
        if (!(mediaNode is IElement media))
        {
            return;
        }

        var src = media.GetAttribute("src");
        if (!string.IsNullOrWhiteSpace(src))
        {
            media.SetAttribute("src", uri.ToAbsoluteURI(src));
        }

        var poster = media.GetAttribute("poster");
        if (!string.IsNullOrWhiteSpace(poster))
        {
            media.SetAttribute("poster", uri.ToAbsoluteURI(poster));
        }

        var srcset = media.GetAttribute("srcset");
        if (srcset != null)
        {
            // Only capture group 1 of RE_SrcSetUrl is converted; groups 2 and 3 are re-emitted as matched.
            // An unmatched group yields an empty Value, so no null handling is needed.
            var newSrcset = RE_SrcSetUrl.Replace(srcset, (match) =>
                uri.ToAbsoluteURI(match.Groups[1].Value) + match.Groups[2].Value + match.Groups[3].Value);
            media.SetAttribute("srcset", newSrcset);
        }
    });
}
/// <summary>
/// Try to extract metadata from JSON-LD object.
/// For now, only Schema.org objects of type Article or its subtypes are supported.
/// </summary>
/// <remarks>
/// Only the first <c>application/ld+json</c> script found in the document is inspected.
/// Extraction is best-effort: malformed JSON or unexpected value kinds never throw, they simply
/// yield fewer (possibly zero) entries. Possible keys are <c>jsonld:title</c>, <c>jsonld:author</c>,
/// <c>jsonld:description</c>, <c>jsonld:siteName</c>, <c>jsonld:datePublished</c> and <c>jsonld:image</c>.
/// </remarks>
/// <param name="doc">The document</param>
/// <returns>Dictionary with any metadata that could be extracted (possibly none)</returns>
internal static Dictionary<string, string> GetJSONLD(IHtmlDocument doc)
{
    var jsonLDMetadata = new Dictionary<string, string>();

    var scripts = NodeUtility.GetAllNodesWithTag(doc.DocumentElement, new string[] { "script" });
    var jsonLdElement = NodeUtility.FindNode(scripts, (el) =>
    {
        return el?.GetAttribute("type") == "application/ld+json";
    });

    if (jsonLdElement == null)
    {
        return jsonLDMetadata;
    }

    // Strip CDATA markers if present. The closing marker is anchored to the end of the text
    // (allowing trailing whitespace); the previous pattern required a literal '$' after "]]>".
    var content = Regex.Replace(jsonLdElement.TextContent, @"^\s*<!\[CDATA\[|\]\]>\s*$", "");

    try
    {
        using (JsonDocument document = JsonDocument.Parse(content))
        {
            ReadJsonLdArticle(document.RootElement, jsonLDMetadata);
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort: JSON-LD comes from untrusted pages, and a broken block
        // must not prevent the rest of the article from being processed.
    }

    return jsonLDMetadata;
}

/// <summary>
/// Copies the supported Schema.org article properties from a parsed JSON-LD document into <paramref name="metadata"/>.
/// </summary>
/// <param name="documentRoot">Root element of the parsed JSON-LD document</param>
/// <param name="metadata">Dictionary receiving the extracted values</param>
private static void ReadJsonLdArticle(JsonElement documentRoot, Dictionary<string, string> metadata)
{
    if (documentRoot.ValueKind != JsonValueKind.Object)
    {
        return;
    }

    var root = documentRoot;
    JsonElement value;
    string text;

    // JsonLD can contain an array of elements inside property @graph
    if (!root.TryGetProperty("@type", out value)
        && root.TryGetProperty("@graph", out value)
        && value.ValueKind == JsonValueKind.Array)
    {
        foreach (var obj in value.EnumerateArray())
        {
            if (TryGetJsonString(obj, "@type", out text) && RE_JsonLdArticleTypes.IsMatch(text))
            {
                root = obj;
                break;
            }
        }
    }

    // A top-level @context also applies to the nodes inside @graph, so accept it from either
    // the selected node or the document root.
    var hasSchemaOrgContext =
        (TryGetJsonString(root, "@context", out text) && Regex.IsMatch(text, @"^https?\:\/\/schema\.org$"))
        || (TryGetJsonString(documentRoot, "@context", out text) && Regex.IsMatch(text, @"^https?\:\/\/schema\.org$"));
    if (!hasSchemaOrgContext)
    {
        return;
    }

    if (!TryGetJsonString(root, "@type", out text) || !RE_JsonLdArticleTypes.IsMatch(text))
    {
        return;
    }

    if (TryGetJsonString(root, "name", out text))
    {
        metadata["jsonld:title"] = text.Trim();
    }

    // headline is read after name, so it wins when both are present
    if (TryGetJsonString(root, "headline", out text))
    {
        metadata["jsonld:title"] = text.Trim();
    }

    if (root.TryGetProperty("author", out value))
    {
        if (value.ValueKind == JsonValueKind.Object)
        {
            if (TryGetJsonString(value, "name", out text))
            {
                metadata["jsonld:author"] = text.Trim();
            }
        }
        else if (value.ValueKind == JsonValueKind.Array
            && value.GetArrayLength() > 0
            && TryGetJsonString(value[0], "name", out text))
        {
            // The first entry must carry a string name; after that, every entry with a string name is joined.
            var byline = new List<string>();
            foreach (var author in value.EnumerateArray())
            {
                if (TryGetJsonString(author, "name", out text))
                {
                    byline.Add(text.Trim());
                }
            }
            metadata["jsonld:author"] = string.Join(", ", byline);
        }
    }

    if (TryGetJsonString(root, "description", out text))
    {
        metadata["jsonld:description"] = text.Trim();
    }

    if (root.TryGetProperty("publisher", out value) && TryGetJsonString(value, "name", out text))
    {
        metadata["jsonld:siteName"] = text.Trim();
    }

    // datePublished and image are plain strings on the article node: read them directly.
    // (Calling GetProperty on the string element itself always threw.)
    if (TryGetJsonString(root, "datePublished", out text))
    {
        metadata["jsonld:datePublished"] = text;
    }

    if (TryGetJsonString(root, "image", out text))
    {
        metadata["jsonld:image"] = text;
    }
}

/// <summary>
/// Reads a string-valued property from a JSON object without throwing on kind mismatches.
/// </summary>
/// <param name="parent">Element expected to be a JSON object</param>
/// <param name="propertyName">Name of the property to read</param>
/// <param name="text">The property's string value, or null when the method returns false</param>
/// <returns>True when <paramref name="parent"/> is an object and the property exists with a JSON string value</returns>
private static bool TryGetJsonString(JsonElement parent, string propertyName, out string text)
{
    text = null;

    JsonElement property;
    if (parent.ValueKind != JsonValueKind.Object
        || !parent.TryGetProperty(propertyName, out property)
        || property.ValueKind != JsonValueKind.String)
    {
        return false;
    }

    text = property.GetString();
    return text != null;
}