Example #1
0
        /// <summary>
        /// Simplifies nested DIV and SECTION elements, walking the tree from the given element
        /// with NodeUtility.GetNextNode.
        /// <para>A DIV/SECTION that NodeUtility.IsElementWithoutContent reports as empty is removed.
        /// A DIV/SECTION whose only element child is itself a DIV or SECTION is replaced by that child,
        /// after the wrapper's attributes have been copied onto the child.</para>
        /// <para>Elements without a parent, and elements whose id starts with "readability", are left untouched.</para>
        /// </summary>
        /// <param name="articleContent">The element at which the walk starts</param>
        internal static void SimplifyNestedElements(IElement articleContent)
        {
            var node = articleContent;

            while (node != null)
            {
                var isContainer = node.TagName == "DIV" || node.TagName == "SECTION";

                // Ordinal comparison: ids are identifiers, not linguistic text.
                var hasReadabilityId = !String.IsNullOrWhiteSpace(node.Id) &&
                                       node.Id.StartsWith("readability", StringComparison.Ordinal);

                if (node.Parent != null && isContainer && !hasReadabilityId)
                {
                    if (NodeUtility.IsElementWithoutContent(node))
                    {
                        // RemoveAndGetNext already yields the node to continue from.
                        node = NodeUtility.RemoveAndGetNext(node) as IElement;
                        continue;
                    }

                    if (NodeUtility.HasSingleTagInsideElement(node, "DIV") || NodeUtility.HasSingleTagInsideElement(node, "SECTION"))
                    {
                        var child = node.Children[0];

                        // Copy the wrapper's attributes to the child; on a name clash the wrapper's value wins.
                        for (var i = 0; i < node.Attributes.Length; i++)
                        {
                            child.SetAttribute(node.Attributes[i].Name, node.Attributes[i].Value);
                        }

                        node.Parent.ReplaceChild(child, node);

                        // Re-examine the promoted child: it may be a redundant wrapper too.
                        node = child;
                        continue;
                    }
                }

                node = NodeUtility.GetNextNode(node);
            }
        }
Example #2
0
 /// <summary>
 /// Strips every script and noscript element found under the given element.
 /// Each script node has its node value blanked and its "src" attribute dropped
 /// by the callback handed to NodeUtility.RemoveNodes; the callback always answers true.
 /// </summary>
 /// <param name="element">The element whose script and noscript descendants are removed</param>
 internal static void RemoveScripts(IElement element)
 {
     var scriptNodes = element.GetElementsByTagName("script");

     NodeUtility.RemoveNodes(scriptNodes, (current) =>
     {
         // Neutralise the script before answering the callback.
         current.NodeValue = "";
         current.RemoveAttribute("src");
         return true;
     });

     var noScriptNodes = element.GetElementsByTagName("noscript");

     NodeUtility.RemoveNodes(noScriptNodes);
 }
Example #3
0
        /// <summary>
        /// Determine whether the element has any descendant whose tag name is listed in divToPElems.
        /// The check recurses into every child node.
        /// </summary>
        /// <param name="element">Element to operate on; a null element simply passes null child nodes to NodeUtility.SomeNode</param>
        /// <returns>true when NodeUtility.SomeNode finds a matching child or descendant, false otherwise</returns>
        internal static bool HasChildBlockElement(IElement element)
        {
            return NodeUtility.SomeNode(element?.ChildNodes, (node) =>
            {
                // Non-element nodes (e.g. text) yield null here: they have no tag name
                // and the recursive call receives null.
                var child = node as IElement;

                return divToPElems.Contains(child?.TagName) ||
                       HasChildBlockElement(child);
            });
        }
Example #4
0
        /// <summary>
        /// Converts each &lt;a&gt; and &lt;img&gt; uri in the given element, and its descendants, to an absolute URI.
        /// The conversion itself is delegated to uri.ToAbsoluteURI
        /// (NOTE(review): #ref URIs are presumably left alone by that helper - confirm).
        /// <para>Links whose href starts with "javascript:" are removed: a link holding a single text node
        /// becomes a plain text node, any other link is replaced by a &lt;span&gt; that receives all of its children.</para>
        /// </summary>
        /// <param name="articleContent">The node in which to fix all relative uri</param>
        /// <param name="uri">The base uri</param>
        /// <param name="doc">The document to operate on; used to create the replacement text and span nodes</param>
        internal static void FixRelativeUris(IElement articleContent, Uri uri, IHtmlDocument doc)
        {
            var links = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "a" });

            NodeUtility.ForEachNode(links, (link) =>
            {
                var href = (link as IElement).GetAttribute("href");
                if (!String.IsNullOrWhiteSpace(href))
                {
                    // Remove links with javascript: URIs, since
                    // they won't work after scripts have been removed from the page.
                    // Ordinal prefix test: this is a URI scheme, not linguistic text.
                    if (href.StartsWith("javascript:", StringComparison.Ordinal))
                    {
                        // if the link only contains simple text content, it can be converted to a text node
                        if (link.ChildNodes.Length == 1 && link.ChildNodes[0].NodeType == NodeType.Text)
                        {
                            var text = doc.CreateTextNode(link.TextContent);
                            link.Parent.ReplaceChild(text, link);
                        }
                        else
                        {
                            // if the link has multiple children, they should all be preserved
                            var container = doc.CreateElement("span");

                            // Appending a child to the span detaches it from the link,
                            // so the link's child list shrinks until it is empty.
                            while (link.ChildNodes.Length > 0)
                            {
                                container.AppendChild(link.ChildNodes[0]);
                            }
                            link.Parent.ReplaceChild(container, link);
                        }
                    }
                    else
                    {
                        (link as IElement).SetAttribute("href", uri.ToAbsoluteURI(href));
                    }
                }
            });

            var imgs = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "img" });

            NodeUtility.ForEachNode(imgs, (img) =>
            {
                var src = (img as IElement).GetAttribute("src");
                if (!String.IsNullOrWhiteSpace(src))
                {
                    (img as IElement).SetAttribute("src", uri.ToAbsoluteURI(src));
                }
            });
        }
Example #5
0
        /// <summary>
        /// Tells whether the element contains exactly one child element, carrying the given tag,
        /// and no text node that matches the "hasContent" regular expression.
        /// </summary>
        /// <param name="element">Element whose children are inspected</param>
        /// <param name="tag">Tag name the single child element must have</param>
        /// <returns>
        /// false when the number of child elements is not 1, when the only child element has another tag,
        /// or when a text child has real content; true otherwise
        /// </returns>
        internal static bool HasSingleTagInsideElement(IElement element, string tag)
        {
            var childElements = element.Children;

            // Exactly one element child, and it must carry the requested tag.
            bool isLoneMatchingChild = childElements.Length == 1 && childElements[0].TagName == tag;
            if (!isLoneMatchingChild)
            {
                return false;
            }

            // Any text node with real content disqualifies the element.
            bool hasMeaningfulText = NodeUtility.SomeNode(element.ChildNodes, (child) =>
            {
                if (child.NodeType != NodeType.Text)
                {
                    return false;
                }

                return regExps["hasContent"].IsMatch(child.TextContent);
            });

            return !hasMeaningfulText;
        }
Example #6
0
        /// <summary>
        /// Clean out elements that match the specified conditions.
        /// The nodes following <paramref name="e"/> (as returned by NodeUtility.GetNextNode) are visited
        /// until the end-of-search marker, i.e. the result of NodeUtility.GetNextNode(e, true), is reached;
        /// every visited node accepted by the filter is removed.
        /// </summary>
        /// <param name="e">Element to operate on</param>
        /// <param name="filter">
        /// Filter function on match id/class combination: it receives the candidate element and the string
        /// "className id", and returns true when the element must be removed.
        /// When null (the default) nothing can match, so the method returns without touching the tree.
        /// </param>
        internal static void CleanMatchedNodes(IElement e, Func <IElement, string, bool> filter = null)
        {
            // The parameter is optional; without a filter there is nothing to remove.
            // Invoking a null delegate would throw a NullReferenceException.
            if (filter == null)
            {
                return;
            }

            var endOfSearchMarkerNode = NodeUtility.GetNextNode(e, true);
            var next = NodeUtility.GetNextNode(e);

            while (next != null && next != endOfSearchMarkerNode)
            {
                if (filter(next, next.ClassName + " " + next.Id))
                {
                    // RemoveAndGetNext returns the node to continue from.
                    next = NodeUtility.RemoveAndGetNext(next as INode) as IElement;
                }
                else
                {
                    next = NodeUtility.GetNextNode(next);
                }
            }
        }
Example #7
0
        /// <summary>
        /// <para>Computes the link density of an element.</para>
        /// <para>The result is the length of the inner text found inside &lt;a&gt; descendants
        /// divided by the length of the inner text of the whole element.</para>
        /// </summary>
        /// <param name="element">Element whose link density is measured</param>
        /// <returns>0 when the element has no inner text, otherwise link text length divided by total text length</returns>
        internal static float GetLinkDensity(IElement element)
        {
            var totalLength = NodeUtility.GetInnerText(element).Length;

            // No text at all: avoid dividing by zero.
            if (totalLength == 0)
            {
                return 0;
            }

            float anchorLength = 0;
            var anchors = element.GetElementsByTagName("a");

            // XXX implement _reduceNodeList?
            NodeUtility.ForEachNode(anchors, (anchor) =>
            {
                var anchorText = NodeUtility.GetInnerText(anchor as IElement);
                anchorLength += anchorText.Length;
            });

            return anchorLength / totalLength;
        }
Example #8
0
        /// <summary>
        /// Converts each &lt;a&gt; and &lt;img&gt; uri in the given element, and its descendants, to an absolute URI.
        /// The conversion itself is delegated to uri.ToAbsoluteURI
        /// (NOTE(review): #ref URIs are presumably left alone by that helper - confirm).
        /// <para>Links whose href starts with "javascript:" are replaced by a text node holding the link's text content.</para>
        /// </summary>
        /// <param name="articleContent">The node in which to fix all relative uri</param>
        /// <param name="uri">The base uri</param>
        /// <param name="doc">The document to operate on; used to create the replacement text nodes</param>
        public static void FixRelativeUris(IElement articleContent, Uri uri, IHtmlDocument doc)
        {
            var links = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "a" });

            NodeUtility.ForEachNode(links, (link) =>
            {
                var href = (link as IElement).GetAttribute("href");
                if (!String.IsNullOrWhiteSpace(href))
                {
                    // Replace links with javascript: URIs with text content, since
                    // they won't work after scripts have been removed from the page.
                    // Ordinal prefix test: this is a URI scheme, not linguistic text.
                    if (href.StartsWith("javascript:", StringComparison.Ordinal))
                    {
                        // Only the text survives; any markup nested inside the link is flattened.
                        var text = doc.CreateTextNode(link.TextContent);
                        link.Parent.ReplaceChild(text, link);
                    }
                    else
                    {
                        (link as IElement).SetAttribute("href", uri.ToAbsoluteURI(href));
                    }
                }
            });

            var imgs = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "img" });

            NodeUtility.ForEachNode(imgs, (img) =>
            {
                var src = (img as IElement).GetAttribute("src");
                if (!String.IsNullOrWhiteSpace(src))
                {
                    (img as IElement).SetAttribute("src", uri.ToAbsoluteURI(src));
                }
            });
        }
Example #9
0
        /// <summary>
        /// Attempts to get metadata for the article.
        /// <para>The values are collected from the document's &lt;meta&gt; tags (name, property and itemprop
        /// attributes), then completed with the &lt;html&gt; language attributes, &lt;time&gt; elements and,
        /// as a last resort for the publication date, a /yyyy/MM/dd fragment of the article address.</para>
        /// </summary>
        /// <param name="doc">The document to read from</param>
        /// <param name="uri">The address of the article; only its path and query are inspected, looking for a date</param>
        /// <param name="language">Language supplied by the caller; when not empty it wins over the languages declared in the document</param>
        /// <returns>
        /// The extracted metadata. Excerpt, Language, FeaturedImage and Author are set to an empty string when
        /// nothing is found; Title falls back to Readability.GetArticleTitle; PublicationDate stays null when no date is found.
        /// </returns>
        public static Metadata GetArticleMetadata(IHtmlDocument doc, Uri uri, string language)
        {
            Metadata metadata = new Metadata();
            Dictionary <string, string> values = new Dictionary <string, string>();
            var metaElements = doc.GetElementsByTagName("meta");

            // Match "description", or Twitter's "twitter:description" (Cards)
            // in name attribute.
            // name is a single value
            var namePattern = @"^\s*((?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|image|site_name)|name)\s*$";

            // Match Facebook's Open Graph title & description properties.
            // property is a space-separated list of values
            var propertyPattern = @"\s*(dc|dcterm|og|twitter|article)\s*:\s*(author|creator|description|title|published_time|image|site_name)(\s+|$)";

            var itemPropPattern = @"\s*datePublished\s*";

            // Returns the first non-empty collected value among the given keys (in order), or "" when none is found.
            string FirstNonEmptyValue(params string[] keys)
            {
                foreach (var key in keys)
                {
                    if (values.TryGetValue(key, out var candidate) && !String.IsNullOrEmpty(candidate))
                    {
                        return candidate;
                    }
                }

                return "";
            }

            // Find description tags.
            NodeUtility.ForEachNode(metaElements, (element) =>
            {
                var meta            = element as IElement;
                var elementName     = meta.GetAttribute("name") ?? "";
                var elementProperty = meta.GetAttribute("property") ?? "";
                var itemProp        = meta.GetAttribute("itemprop") ?? "";
                var content         = meta.GetAttribute("content");

                // avoid issues with no meta tags
                if (String.IsNullOrEmpty(content))
                {
                    return;
                }
                MatchCollection matches = null;
                String name             = "";

                // A plain "author" in any of the three attributes short-circuits everything else.
                if (elementName == "author" || elementProperty == "author" || itemProp == "author")
                {
                    metadata.Byline = content;
                    metadata.Author = content;
                    return;
                }

                if (!String.IsNullOrEmpty(elementProperty))
                {
                    matches = Regex.Matches(elementProperty, propertyPattern);
                    if (matches.Count > 0)
                    {
                        for (int i = matches.Count - 1; i >= 0; i--)
                        {
                            // Convert to lowercase, and remove any whitespace
                            // so we can match below.
                            // Invariant casing: keys must not depend on the current culture (e.g. Turkish dotless i).
                            name = Regex.Replace(matches[i].Value.ToLowerInvariant(), @"\s+", "");

                            // multiple authors
                            values[name] = content.Trim();
                        }
                    }
                }

                if ((matches == null || matches.Count == 0) &&
                    !String.IsNullOrEmpty(elementName) && Regex.IsMatch(elementName, namePattern, RegexOptions.IgnoreCase))
                {
                    // Convert to lowercase, remove any whitespace, and convert dots
                    // to colons so we can match below.
                    name         = Regex.Replace(Regex.Replace(elementName.ToLowerInvariant(), @"\s+", ""), @"\.", ":");
                    values[name] = content.Trim();
                }
                else if (Regex.IsMatch(elementProperty, propertyPattern, RegexOptions.IgnoreCase))
                {
                    name = elementProperty;
                }
                else if (Regex.IsMatch(itemProp, itemPropPattern, RegexOptions.IgnoreCase))
                {
                    name = itemProp;
                }

                if (!String.IsNullOrEmpty(name))
                {
                    // Convert to lowercase and remove any whitespace
                    // so we can match below.
                    name = Regex.Replace(name.ToLowerInvariant(), @"\s", "", RegexOptions.IgnoreCase);

                    // Never overwrite here: a value stored by the branches above wins.
                    if (!values.ContainsKey(name))
                    {
                        values.Add(name, content.Trim());
                    }
                }
            });

            // Find the the description of the article
            metadata.Excerpt = FirstNonEmptyValue(
                "description",
                "dc:description",
                "dcterm:description",
                "og:description",
                "weibo:article:description",
                "weibo:webpage:description",
                "twitter:description");

            // Get the name of the site
            if (values.TryGetValue("og:site_name", out var siteName))
            {
                metadata.SiteName = siteName;
            }

            // Find the title of the article
            metadata.Title = FirstNonEmptyValue(
                "dc:title",
                "dcterm:title",
                "og:title",
                "weibo:article:title",
                "weibo:webpage:title",
                "twitter:title",
                "title");

            // Let's try to eliminate the site name from the title
            metadata.Title = Readability.CleanTitle(metadata.Title, metadata.SiteName);

            // We did not find any title,
            // we try to get it from the title tag
            if (String.IsNullOrEmpty(metadata.Title))
            {
                metadata.Title = Readability.GetArticleTitle(doc);
            }

            // added language extraction
            // Kept lazy on purpose: the document is only queried when the previous candidates are empty.
            IEnumerable <string> LanguageHeuristics()
            {
                yield return(language);

                yield return(doc.GetElementsByTagName("html")[0].GetAttribute("lang"));

                yield return(doc.GetElementsByTagName("html")[0].GetAttribute("xml:lang"));

                yield return(doc.QuerySelector("meta[http-equiv=\"Content-Language\"]")?.GetAttribute("content"));

                // this is wrong, but it's used
                yield return(doc.QuerySelector("meta[name=\"lang\"]")?.GetAttribute("value"));
            }

            metadata.Language = LanguageHeuristics().FirstOrDefault(l => !String.IsNullOrEmpty(l)) ?? "";

            // Find the featured image of the article
            metadata.FeaturedImage = FirstNonEmptyValue(
                "og:image",
                "twitter:image",
                "weibo:article:image",
                "weibo:webpage:image");

            if (String.IsNullOrEmpty(metadata.Author))
            {
                // We try to find a meta tag for the author.
                // Note that there is Open Graph tag for an author,
                // but it usually contains a profile URL of the author.
                // So we do not use it
                metadata.Author = FirstNonEmptyValue("dc:creator", "dcterm:creator", "author");
            }

            // added date extraction
            DateTime date;
            DateTime? publicationDate = null;

            var dateKeys = new string[]
            {
                "article:published_time",
                "date",
                "datepublished",
                "weibo:article:create_at",
                "weibo:webpage:create_at"
            };

            // The first key holding a parsable date wins; DateTime.MinValue is treated as "no date".
            foreach (var dateKey in dateKeys)
            {
                if (values.TryGetValue(dateKey, out var rawDate) &&
                    DateTime.TryParse(rawDate, out date) &&
                    date != DateTime.MinValue)
                {
                    publicationDate = date;
                    break;
                }
            }

            metadata.PublicationDate = publicationDate;

            if (metadata.PublicationDate == null)
            {
                var times = doc.GetElementsByTagName("time");

                // No early exit: when several <time> elements qualify, the last one wins.
                foreach (var time in times)
                {
                    if (!String.IsNullOrEmpty(time.GetAttribute("pubDate")) &&
                        DateTime.TryParse(time.GetAttribute("datetime"), out date))
                    {
                        metadata.PublicationDate = date;
                    }
                }
            }

            if (metadata.PublicationDate == null)
            {
                // as a last resort check the URL for a date
                Match maybeDate = Regex.Match(uri.PathAndQuery, "/(?<year>[0-9]{4})/(?<month>[0-9]{2})/(?<day>[0-9]{2})?");
                if (maybeDate.Success)
                {
                    var year  = int.Parse(maybeDate.Groups["year"].Value);
                    var month = int.Parse(maybeDate.Groups["month"].Value);
                    var day   = !String.IsNullOrEmpty(maybeDate.Groups["day"].Value) ? int.Parse(maybeDate.Groups["day"].Value) : 1;

                    // The digits come from an arbitrary URL (e.g. /2019/13/45): validate them first,
                    // because the DateTime constructor throws on out-of-range components.
                    if (year >= 1 && month >= 1 && month <= 12 &&
                        day >= 1 && day <= DateTime.DaysInMonth(year, month))
                    {
                        metadata.PublicationDate = new DateTime(year, month, day);
                    }
                }
            }

            return(metadata);
        }
Example #10
0
        /// <summary>
        /// Get the article title.
        /// <para>Starts from the document title and tries to strip the site name from it: when the title contains
        /// a separator surrounded by spaces (|, -, \, /, &gt; or ») one side of the separator is dropped; when it
        /// contains ": " and no h1/h2 heading holds the exact title, the part after a colon is kept; when it is
        /// longer than 150 or shorter than 15 characters and the document has exactly one h1, that heading is used.</para>
        /// <para>If the result has 4 words or fewer and looks like too much was removed, the original title is returned.</para>
        /// </summary>
        /// <param name="doc">The document</param>
        /// <returns>
        /// The clean title
        /// </returns>
        public static string GetArticleTitle(IHtmlDocument doc)
        {
            var curTitle  = "";
            var origTitle = "";

            try
            {
                curTitle = origTitle = doc.Title.Trim();
            }
            catch (Exception)
            {
                // Deliberately best-effort: if the title cannot be read, both titles stay empty
                // and the heuristics below work from an empty string.
            }

            var titleHadHierarchicalSeparators = false;

            // Number of whitespace-separated chunks; note that an empty string counts as 1.
            int wordCount(String str)
            {
                return(Regex.Split(str, @"\s+").Length);
            }

            // If there's a separator in the title, first remove the final part.
            // The separator must be surrounded by spaces, like in the replacements below:
            // a hyphen or slash inside a word (e.g. "self-driving") is not a separator.
            if (Regex.IsMatch(curTitle, @" [\|\-\\\/>»] "))
            {
                titleHadHierarchicalSeparators = Regex.IsMatch(curTitle, @" [\\\/>»] ");
                curTitle = Regex.Replace(origTitle, @"(.*) [\|\-\\\/>»] .*", "$1", RegexOptions.IgnoreCase);

                // If the resulting title is too short (3 words or fewer), remove
                // the first part instead:
                if (wordCount(curTitle) < 3)
                {
                    curTitle = Regex.Replace(origTitle, @"[^\|\-\\\/>»]* [\|\-\\\/>»](.*)", "$1", RegexOptions.IgnoreCase);
                }
            }
            else if (curTitle.IndexOf(": ", StringComparison.Ordinal) != -1)
            {
                // Check if we have an heading containing this exact string, so we
                // could assume it's the full title.
                var headings = NodeUtility.ConcatNodeLists(
                    doc.GetElementsByTagName("h1"),
                    doc.GetElementsByTagName("h2")
                    );
                var trimmedTitle = curTitle.Trim();
                var match        = NodeUtility.SomeNode(headings, (heading) =>
                {
                    return(heading.TextContent.Trim() == trimmedTitle);
                });

                // If we don't, let's extract the title out of the original title string.
                if (!match)
                {
                    curTitle = origTitle.Substring(origTitle.LastIndexOf(':') + 1);

                    // If the title is now too short, try the first colon instead:
                    if (wordCount(curTitle) < 3)
                    {
                        curTitle = origTitle.Substring(origTitle.IndexOf(':') + 1);
                    }
                }
            }
            else if (curTitle.Length > 150 || curTitle.Length < 15)
            {
                var hOnes = doc.GetElementsByTagName("h1");

                if (hOnes.Length == 1)
                {
                    curTitle = NodeUtility.GetInnerText(hOnes[0]);
                }
            }

            curTitle = curTitle.Trim();

            // If we now have 4 words or fewer as our title, and either no
            // 'hierarchical' separators (\, /, > or ») were found in the original
            // title or we decreased the number of words by more than 1 word, use
            // the original title.
            var curTitleWordCount = wordCount(curTitle);

            if (curTitleWordCount <= 4 && (
                    !titleHadHierarchicalSeparators ||
                    curTitleWordCount != wordCount(Regex.Replace(origTitle, @"[\|\-\\\/>»: ]+", " ", RegexOptions.IgnoreCase)) - 1))
            {
                curTitle = origTitle;
            }

            return(curTitle);
        }
Example #11
0
 /// <summary>
 /// Tells whether a node counts as phrasing content, as described at
 /// https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
 /// <para>Text nodes and nodes whose name is listed in phrasingElems always qualify;
 /// A, DEL and INS nodes qualify only when every one of their child nodes qualifies.</para>
 /// </summary>
 /// <param name="node">Node to inspect</param>
 /// <returns>true when the node is phrasing content, false otherwise</returns>
 internal static bool IsPhrasingContent(INode node)
 {
     if (node.NodeType == NodeType.Text)
     {
         return true;
     }

     var nodeName = node.NodeName;

     if (Array.IndexOf(phrasingElems, nodeName) != -1)
     {
         return true;
     }

     // These three are phrasing content only if all of their children are.
     bool dependsOnChildren = nodeName == "A" || nodeName == "DEL" || nodeName == "INS";
     if (!dependsOnChildren)
     {
         return false;
     }

     return NodeUtility.EveryNode(node.ChildNodes, IsPhrasingContent);
 }
Example #12
0
        /// <summary>
        /// Converts each &lt;a&gt; href and each media uri (src, poster and srcset of img, picture, figure, video,
        /// audio and source elements) in the given element, and its descendants, to an absolute URI.
        /// The conversion itself is delegated to uri.ToAbsoluteURI
        /// (NOTE(review): #ref URIs are presumably left alone by that helper - confirm).
        /// <para>Links whose href starts with "javascript:" are removed: a link holding a single text node
        /// becomes a plain text node, any other link is replaced by a &lt;span&gt; that receives all of its children.</para>
        /// </summary>
        /// <param name="articleContent">The node in which to fix all relative uri</param>
        /// <param name="uri">The base uri</param>
        /// <param name="doc">The document to operate on; used to create the replacement text and span nodes</param>
        internal static void FixRelativeUris(IElement articleContent, Uri uri, IHtmlDocument doc)
        {
            var links = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "a" });

            NodeUtility.ForEachNode(links, (link) =>
            {
                var href = (link as IElement).GetAttribute("href");
                if (!string.IsNullOrWhiteSpace(href))
                {
                    // Remove links with javascript: URIs, since
                    // they won't work after scripts have been removed from the page.
                    // Ordinal prefix test: this is a URI scheme, not linguistic text.
                    if (href.StartsWith("javascript:", StringComparison.Ordinal))
                    {
                        // if the link only contains simple text content, it can be converted to a text node
                        if (link.ChildNodes.Length == 1 && link.ChildNodes[0].NodeType == NodeType.Text)
                        {
                            var text = doc.CreateTextNode(link.TextContent);
                            link.Parent.ReplaceChild(text, link);
                        }
                        else
                        {
                            // if the link has multiple children, they should all be preserved
                            var container = doc.CreateElement("span");

                            // Appending a child to the span detaches it from the link,
                            // so the link's child list shrinks until it is empty.
                            while (link.ChildNodes.Length > 0)
                            {
                                container.AppendChild(link.ChildNodes[0]);
                            }
                            link.Parent.ReplaceChild(container, link);
                        }
                    }
                    else
                    {
                        (link as IElement).SetAttribute("href", uri.ToAbsoluteURI(href));
                    }
                }
            });

            var medias = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "img", "picture", "figure", "video", "audio", "source" });

            NodeUtility.ForEachNode(medias, (mediaNode) =>
            {
                if (mediaNode is IElement media)
                {
                    var src    = media.GetAttribute("src");
                    var poster = media.GetAttribute("poster");
                    var srcset = media.GetAttribute("srcset");

                    if (src != null)
                    {
                        media.SetAttribute("src", uri.ToAbsoluteURI(src));
                    }

                    if (poster != null)
                    {
                        media.SetAttribute("poster", uri.ToAbsoluteURI(poster));
                    }

                    if (srcset != null)
                    {
                        // Group 1 is passed through ToAbsoluteURI; groups 2 and 3 are appended unchanged.
                        // A group that did not take part in the match has an empty Value.
                        var newSrcset = RE_SrcSetUrl.Replace(srcset, (input) =>
                        {
                            return(uri.ToAbsoluteURI(input.Groups[1].Value) + input.Groups[2].Value + input.Groups[3].Value);
                        });

                        media.SetAttribute("srcset", newSrcset);
                    }
                }
            });
        }
Example #13
0
        /// <summary>
        /// Try to extract metadata from JSON-LD object.
        /// For now, only Schema.org objects of type Article or its subtypes are supported.
        /// </summary>
        /// <remarks>
        /// Extraction is best-effort: malformed JSON, or properties whose JSON type is not the
        /// expected one, are skipped instead of aborting the extraction of the remaining fields.
        /// Possible keys of the result are <c>jsonld:title</c>, <c>jsonld:author</c>,
        /// <c>jsonld:description</c>, <c>jsonld:siteName</c>, <c>jsonld:datePublished</c>
        /// and <c>jsonld:image</c>.
        /// </remarks>
        /// <param name="doc">The document</param>
        /// <returns>Dictionary with any metadata that could be extracted (possibly none)</returns>
        internal static Dictionary<string, string> GetJSONLD(IHtmlDocument doc)
        {
            var jsonLDMetadata = new Dictionary<string, string>();

            var scripts = NodeUtility.GetAllNodesWithTag(doc.DocumentElement, new string[] { "script" });

            var jsonLdElement = NodeUtility.FindNode(scripts, (el) => {
                return(el?.GetAttribute("type") == "application/ld+json");
            });

            if (jsonLdElement == null)
            {
                return jsonLDMetadata;
            }

            // Strip CDATA markers if present: the opening marker at the start of the text
            // and the closing marker (optionally followed by whitespace) at the end.
            var content = Regex.Replace(jsonLdElement.TextContent ?? "", @"^\s*<!\[CDATA\[|\]\]>\s*$", "");

            try
            {
                using (JsonDocument document = JsonDocument.Parse(content))
                {
                    var documentRoot = document.RootElement;
                    if (documentRoot.ValueKind != JsonValueKind.Object)
                    {
                        return jsonLDMetadata;
                    }

                    var         root = documentRoot;
                    JsonElement value;
                    string      text;

                    // JsonLD can contain an array of elements inside property @graph
                    if (!root.TryGetProperty("@type", out value) &&
                        root.TryGetProperty("@graph", out value) &&
                        value.ValueKind == JsonValueKind.Array)
                    {
                        foreach (var obj in value.EnumerateArray())
                        {
                            if (TryGetJsonLdString(obj, "@type", out text) &&
                                RE_JsonLdArticleTypes.IsMatch(text))
                            {
                                root = obj;
                                break;
                            }
                        }
                    }

                    // The @context normally lives on the top-level object, not on the items of
                    // @graph, so it is accepted from either the document root or the selected node.
                    if (!HasSchemaOrgContext(documentRoot) && !HasSchemaOrgContext(root))
                    {
                        return jsonLDMetadata;
                    }

                    if (!TryGetJsonLdString(root, "@type", out text) ||
                        !RE_JsonLdArticleTypes.IsMatch(text))
                    {
                        return jsonLDMetadata;
                    }

                    // "headline" is read after "name" on purpose: when both exist, headline wins.
                    if (TryGetJsonLdString(root, "name", out text))
                    {
                        jsonLDMetadata["jsonld:title"] = text.Trim();
                    }
                    if (TryGetJsonLdString(root, "headline", out text))
                    {
                        jsonLDMetadata["jsonld:title"] = text.Trim();
                    }

                    if (root.TryGetProperty("author", out value))
                    {
                        if (value.ValueKind == JsonValueKind.Object)
                        {
                            if (TryGetJsonLdString(value, "name", out text))
                            {
                                jsonLDMetadata["jsonld:author"] = text.Trim();
                            }
                        }
                        else if (value.ValueKind == JsonValueKind.Array &&
                                 value.GetArrayLength() > 0 &&
                                 TryGetJsonLdString(value[0], "name", out text))
                        {
                            // Join the names of every author that has a string name.
                            List<string> byline = new List<string>();
                            foreach (var author in value.EnumerateArray())
                            {
                                if (TryGetJsonLdString(author, "name", out text))
                                {
                                    byline.Add(text.Trim());
                                }
                            }

                            jsonLDMetadata["jsonld:author"] = String.Join(", ", byline);
                        }
                    }

                    if (TryGetJsonLdString(root, "description", out text))
                    {
                        jsonLDMetadata["jsonld:description"] = text.Trim();
                    }
                    if (root.TryGetProperty("publisher", out value) &&
                        TryGetJsonLdString(value, "name", out text))
                    {
                        jsonLDMetadata["jsonld:siteName"] = text.Trim();
                    }
                    // The two values below are stored as-is (not trimmed).
                    if (TryGetJsonLdString(root, "datePublished", out text))
                    {
                        jsonLDMetadata["jsonld:datePublished"] = text;
                    }
                    if (TryGetJsonLdString(root, "image", out text))
                    {
                        jsonLDMetadata["jsonld:image"] = text;
                    }
                }
            }
            catch (JsonException)
            {
                // The script content is not valid JSON: JSON-LD metadata is optional,
                // so whatever was collected so far (normally nothing) is returned.
            }

            return jsonLDMetadata;
        }

        /// <summary>
        /// Reads a property of a JSON object only when it exists and holds a JSON string.
        /// </summary>
        /// <param name="element">The element to read from; anything but a JSON object yields false</param>
        /// <param name="propertyName">The name of the property to read</param>
        /// <param name="result">The string value on success, null otherwise</param>
        /// <returns>True if the property exists and is a string</returns>
        private static bool TryGetJsonLdString(JsonElement element, string propertyName, out string result)
        {
            JsonElement property;
            if (element.ValueKind == JsonValueKind.Object &&
                element.TryGetProperty(propertyName, out property) &&
                property.ValueKind == JsonValueKind.String)
            {
                result = property.GetString();
                return true;
            }

            result = null;
            return false;
        }

        /// <summary>
        /// Checks whether the @context of a JSON-LD object is the Schema.org vocabulary
        /// (http or https, with or without a trailing slash).
        /// </summary>
        /// <param name="element">The JSON-LD object to inspect</param>
        /// <returns>True if the element declares a Schema.org @context as a string</returns>
        private static bool HasSchemaOrgContext(JsonElement element)
        {
            string context;
            return TryGetJsonLdString(element, "@context", out context) &&
                   Regex.IsMatch(context, @"^https?\:\/\/schema\.org\/?$");
        }