/// <summary>
        /// Rewrites the <c>href</c> of every &lt;a&gt; and the <c>src</c> of every &lt;img&gt; returned by
        /// <c>NodeUtility.GetAllNodesWithTag</c> for the given element through <c>uri.ToAbsoluteURI</c>,
        /// and unwraps links whose <c>href</c> is a <c>javascript:</c> URI.
        /// How fragment-only (#ref) URIs are treated is decided by <c>ToAbsoluteURI</c>, not by this method.
        /// </summary>
        /// <param name="articleContent">The node in which to fix all relative uri</param>
        /// <param name="uri">The base uri</param>
        /// <param name="doc">The document used to create the replacement nodes for javascript: links</param>
        internal static void FixRelativeUris(IElement articleContent, Uri uri, IHtmlDocument doc)
        {
            var links = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "a" });

            NodeUtility.ForEachNode(links, (link) =>
            {
                // Nodes that are not elements carry no attributes and are skipped.
                var anchor = link as IElement;
                var href   = anchor?.GetAttribute("href");
                if (String.IsNullOrWhiteSpace(href))
                {
                    return;
                }

                // Ordinal comparison: a URI scheme is not linguistic text, so the
                // prefix test must not depend on the current culture.
                if (href.StartsWith("javascript:", StringComparison.Ordinal))
                {
                    // Remove links with javascript: URIs, since
                    // they won't work after scripts have been removed from the page.
                    if (link.ChildNodes.Length == 1 && link.ChildNodes[0].NodeType == NodeType.Text)
                    {
                        // the link only contains simple text content, so a text node is enough
                        var text = doc.CreateTextNode(link.TextContent);
                        link.Parent.ReplaceChild(text, link);
                    }
                    else
                    {
                        // the link has other children: move all of them into a span so they are preserved
                        var container = doc.CreateElement("span");
                        while (link.ChildNodes.Length > 0)
                        {
                            container.AppendChild(link.ChildNodes[0]);
                        }
                        link.Parent.ReplaceChild(container, link);
                    }
                }
                else
                {
                    anchor.SetAttribute("href", uri.ToAbsoluteURI(href));
                }
            });

            var imgs = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "img" });

            NodeUtility.ForEachNode(imgs, (img) =>
            {
                var image = img as IElement;
                var src   = image?.GetAttribute("src");
                if (!String.IsNullOrWhiteSpace(src))
                {
                    image.SetAttribute("src", uri.ToAbsoluteURI(src));
                }
            });
        }
Exemple #2
0
        /// <summary>
        /// <para>Measures how much of an element's text is link text.</para>
        /// <para>The result is the summed inner-text length of every &lt;a&gt; element returned by
        /// <c>GetElementsByTagName</c> divided by the inner-text length of the element itself.</para>
        /// </summary>
        /// <param name="element">Element to operate on</param>
        /// <returns>The link-text to total-text ratio, or 0 when the element has no inner text.</returns>
        internal static float GetLinkDensity(IElement element)
        {
            var totalChars = NodeUtility.GetInnerText(element).Length;

            // Nothing to divide by: an element without text has no link density.
            if (totalChars == 0)
            {
                return 0;
            }

            var anchors = element.GetElementsByTagName("a");
            float anchorChars = 0;

            // XXX implement _reduceNodeList?
            NodeUtility.ForEachNode(anchors, (anchor) =>
            {
                var anchorText = NodeUtility.GetInnerText(anchor as IElement);
                anchorChars += anchorText.Length;
            });

            return anchorChars / totalChars;
        }
Exemple #3
0
        /// <summary>
        /// Rewrites the <c>href</c> of every &lt;a&gt; and the <c>src</c> of every &lt;img&gt; returned by
        /// <c>NodeUtility.GetAllNodesWithTag</c> for the given element through <c>uri.ToAbsoluteURI</c>,
        /// and unwraps links whose <c>href</c> is a <c>javascript:</c> URI.
        /// How fragment-only (#ref) URIs are treated is decided by <c>ToAbsoluteURI</c>, not by this method.
        /// </summary>
        /// <param name="articleContent">The node in which to fix all relative uri</param>
        /// <param name="uri">The base uri</param>
        /// <param name="doc">The document used to create the replacement nodes for javascript: links</param>
        public static void FixRelativeUris(IElement articleContent, Uri uri, IHtmlDocument doc)
        {
            var links = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "a" });

            NodeUtility.ForEachNode(links, (link) =>
            {
                // Nodes that are not elements carry no attributes and are skipped.
                var anchor = link as IElement;
                var href   = anchor?.GetAttribute("href");
                if (String.IsNullOrWhiteSpace(href))
                {
                    return;
                }

                // Ordinal comparison: a URI scheme is not linguistic text, so the
                // prefix test must not depend on the current culture.
                if (href.StartsWith("javascript:", StringComparison.Ordinal))
                {
                    // Links with javascript: URIs won't work after scripts have been
                    // removed from the page, so the anchor itself is dropped.
                    if (link.ChildNodes.Length == 1 && link.ChildNodes[0].NodeType == NodeType.Text)
                    {
                        // plain text link: a text node is enough
                        var text = doc.CreateTextNode(link.TextContent);
                        link.Parent.ReplaceChild(text, link);
                    }
                    else
                    {
                        // Flattening to TextContent would discard nested markup (images,
                        // emphasis, ...), so the children are moved into a span instead.
                        var container = doc.CreateElement("span");
                        while (link.ChildNodes.Length > 0)
                        {
                            container.AppendChild(link.ChildNodes[0]);
                        }
                        link.Parent.ReplaceChild(container, link);
                    }
                }
                else
                {
                    anchor.SetAttribute("href", uri.ToAbsoluteURI(href));
                }
            });

            var imgs = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "img" });

            NodeUtility.ForEachNode(imgs, (img) =>
            {
                var image = img as IElement;
                var src   = image?.GetAttribute("src");
                if (!String.IsNullOrWhiteSpace(src))
                {
                    image.SetAttribute("src", uri.ToAbsoluteURI(src));
                }
            });
        }
Exemple #4
0
        /// <summary>
        /// Attempts to get metadata for the article: excerpt, site name, title, language,
        /// featured image, author/byline and publication date.
        /// </summary>
        /// <param name="doc">The document whose &lt;meta&gt;, &lt;html&gt; and &lt;time&gt; elements are inspected</param>
        /// <param name="uri">The article uri; its path and query are the last-resort source for the publication date</param>
        /// <param name="language">A language supplied by the caller; when non-empty it takes precedence over the hints found in the document</param>
        /// <returns>A <c>Metadata</c> instance filled with whatever could be found</returns>
        public static Metadata GetArticleMetadata(IHtmlDocument doc, Uri uri, string language)
        {
            Metadata metadata = new Metadata();
            Dictionary<string, string> values = new Dictionary<string, string>();
            var metaElements = doc.GetElementsByTagName("meta");

            // Match "description", or Twitter's "twitter:description" (Cards)
            // in name attribute.
            // name is a single value
            var namePattern = @"^\s*((?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|image|site_name)|name)\s*$";

            // Match Facebook's Open Graph title & description properties.
            // property is a space-separated list of values
            var propertyPattern = @"\s*(dc|dcterm|og|twitter|article)\s*:\s*(author|creator|description|title|published_time|image|site_name)(\s+|$)";

            var itemPropPattern = @"\s*datePublished\s*";

            // Collect the interesting <meta> values into the dictionary.
            NodeUtility.ForEachNode(metaElements, (element) =>
            {
                if (!(element is IElement meta))
                {
                    return;
                }

                var elementName     = meta.GetAttribute("name") ?? "";
                var elementProperty = meta.GetAttribute("property") ?? "";
                var itemProp        = meta.GetAttribute("itemprop") ?? "";
                var content         = meta.GetAttribute("content");

                // a meta tag without content has nothing to contribute
                if (String.IsNullOrEmpty(content))
                {
                    return;
                }

                // A tag identified exactly as "author" sets the author directly.
                if (elementName == "author" || elementProperty == "author" || itemProp == "author")
                {
                    metadata.Byline = content;
                    metadata.Author = content;
                    return;
                }

                MatchCollection matches = null;
                String name             = "";

                if (!String.IsNullOrEmpty(elementProperty))
                {
                    matches = Regex.Matches(elementProperty, propertyPattern);
                    for (int i = matches.Count - 1; i >= 0; i--)
                    {
                        // Convert to lowercase, and remove any whitespace
                        // so we can match below. Invariant casing keeps the keys
                        // stable regardless of the current culture.
                        name = Regex.Replace(matches[i].Value.ToLowerInvariant(), @"\s+", "");

                        // a later tag with the same property overwrites the earlier value
                        values[name] = content.Trim();
                    }
                }

                if ((matches == null || matches.Count == 0) &&
                    !String.IsNullOrEmpty(elementName) && Regex.IsMatch(elementName, namePattern, RegexOptions.IgnoreCase))
                {
                    // Convert to lowercase, remove any whitespace, and convert dots
                    // to colons so we can match below.
                    name         = Regex.Replace(Regex.Replace(elementName.ToLowerInvariant(), @"\s+", ""), @"\.", ":");
                    values[name] = content.Trim();
                }
                else if (Regex.IsMatch(elementProperty, propertyPattern, RegexOptions.IgnoreCase))
                {
                    name = elementProperty;
                }
                else if (Regex.IsMatch(itemProp, itemPropPattern, RegexOptions.IgnoreCase))
                {
                    name = itemProp;
                }

                if (!String.IsNullOrEmpty(name))
                {
                    // Convert to lowercase and remove any whitespace
                    // so we can match below. Existing entries are kept.
                    name = Regex.Replace(name.ToLowerInvariant(), @"\s", "");
                    if (!values.ContainsKey(name))
                    {
                        values.Add(name, content.Trim());
                    }
                }
            });

            // Returns the first non-empty collected value among the given keys, in order, or "".
            string FirstNonEmpty(params string[] keys)
            {
                foreach (var key in keys)
                {
                    if (values.TryGetValue(key, out var found) && !String.IsNullOrEmpty(found))
                    {
                        return found;
                    }
                }

                return "";
            }

            // Find the description of the article
            metadata.Excerpt = FirstNonEmpty(
                "description",
                "dc:description",
                "dcterm:description",
                "og:description",
                "weibo:article:description",
                "weibo:webpage:description",
                "twitter:description");

            // Get the name of the site
            if (values.TryGetValue("og:site_name", out var siteName))
            {
                metadata.SiteName = siteName;
            }

            // Find the title of the article
            metadata.Title = FirstNonEmpty(
                "dc:title",
                "dcterm:title",
                "og:title",
                "weibo:article:title",
                "weibo:webpage:title",
                "twitter:title",
                "title");

            // Let's try to eliminate the site name from the title
            metadata.Title = Readability.CleanTitle(metadata.Title, metadata.SiteName);

            // We did not find any title,
            // we try to get it from the title tag
            if (String.IsNullOrEmpty(metadata.Title))
            {
                metadata.Title = Readability.GetArticleTitle(doc);
            }

            // Language candidates, evaluated lazily so the document is only
            // queried when the earlier candidates were empty.
            IEnumerable<string> LanguageHeuristics()
            {
                yield return language;

                yield return doc.GetElementsByTagName("html")[0].GetAttribute("lang");

                yield return doc.GetElementsByTagName("html")[0].GetAttribute("xml:lang");

                yield return doc.QuerySelector("meta[http-equiv=\"Content-Language\"]")?.GetAttribute("content");

                // this is wrong, but it's used
                yield return doc.QuerySelector("meta[name=\"lang\"]")?.GetAttribute("value");
            }

            metadata.Language = LanguageHeuristics().FirstOrDefault(l => !String.IsNullOrEmpty(l)) ?? "";

            // Find the featured image of the article
            metadata.FeaturedImage = FirstNonEmpty(
                "og:image",
                "twitter:image",
                "weibo:article:image",
                "weibo:webpage:image");

            if (String.IsNullOrEmpty(metadata.Author))
            {
                // We try to find a meta tag for the author.
                // Note that there is an Open Graph tag for an author,
                // but it usually contains a profile URL of the author.
                // So we do not use it
                metadata.Author = FirstNonEmpty("dc:creator", "dcterm:creator", "author");
            }

            // Publication date: the first collected value that parses as a date wins.
            var dateKeys = new[]
            {
                "article:published_time",
                "date",
                "datepublished",
                "weibo:article:create_at",
                "weibo:webpage:create_at"
            };

            DateTime? published = null;
            foreach (var dateKey in dateKeys)
            {
                if (values.TryGetValue(dateKey, out var rawDate) &&
                    DateTime.TryParse(rawDate, out var parsedMeta) &&
                    parsedMeta != DateTime.MinValue)
                {
                    published = parsedMeta;
                    break;
                }
            }

            metadata.PublicationDate = published;

            if (metadata.PublicationDate == null)
            {
                foreach (var time in doc.GetElementsByTagName("time"))
                {
                    // pubdate is normally written as a boolean attribute (<time pubdate ...>),
                    // whose value is empty, so test for presence rather than for a non-empty value.
                    if (time.GetAttribute("pubDate") != null &&
                        DateTime.TryParse(time.GetAttribute("datetime"), out var parsedTime))
                    {
                        metadata.PublicationDate = parsedTime;
                    }
                }
            }

            if (metadata.PublicationDate == null)
            {
                // as a last resort check the URL for a date
                Match maybeDate = Regex.Match(uri.PathAndQuery, "/(?<year>[0-9]{4})/(?<month>[0-9]{2})/(?<day>[0-9]{2})?");
                if (maybeDate.Success)
                {
                    int year  = int.Parse(maybeDate.Groups["year"].Value);
                    int month = int.Parse(maybeDate.Groups["month"].Value);
                    int day   = !String.IsNullOrEmpty(maybeDate.Groups["day"].Value) ? int.Parse(maybeDate.Groups["day"].Value) : 1;

                    // Digits in a path are not necessarily a date (e.g. /1234/56/78/):
                    // only build the DateTime when the parts form a real calendar day,
                    // otherwise the constructor would throw ArgumentOutOfRangeException.
                    if (year >= 1 && month >= 1 && month <= 12 &&
                        day >= 1 && day <= DateTime.DaysInMonth(year, month))
                    {
                        metadata.PublicationDate = new DateTime(year, month, day);
                    }
                }
            }

            return metadata;
        }
Exemple #5
0
        /// <summary>
        /// Rewrites the <c>href</c> of every &lt;a&gt; and the <c>src</c>, <c>poster</c> and <c>srcset</c>
        /// attributes of every img, picture, figure, video, audio and source node returned by
        /// <c>NodeUtility.GetAllNodesWithTag</c> for the given element through <c>uri.ToAbsoluteURI</c>,
        /// and unwraps links whose <c>href</c> is a <c>javascript:</c> URI.
        /// How fragment-only (#ref) URIs are treated is decided by <c>ToAbsoluteURI</c>, not by this method.
        /// </summary>
        /// <param name="articleContent">The node in which to fix all relative uri</param>
        /// <param name="uri">The base uri</param>
        /// <param name="doc">The document used to create the replacement nodes for javascript: links</param>
        internal static void FixRelativeUris(IElement articleContent, Uri uri, IHtmlDocument doc)
        {
            var links = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "a" });

            NodeUtility.ForEachNode(links, (link) =>
            {
                // Nodes that are not elements carry no attributes and are skipped.
                var anchor = link as IElement;
                var href   = anchor?.GetAttribute("href");
                if (string.IsNullOrWhiteSpace(href))
                {
                    return;
                }

                // Ordinal comparison: a URI scheme is not linguistic text, so the
                // prefix test must not depend on the current culture.
                if (href.StartsWith("javascript:", StringComparison.Ordinal))
                {
                    // Remove links with javascript: URIs, since
                    // they won't work after scripts have been removed from the page.
                    if (link.ChildNodes.Length == 1 && link.ChildNodes[0].NodeType == NodeType.Text)
                    {
                        // the link only contains simple text content, so a text node is enough
                        var text = doc.CreateTextNode(link.TextContent);
                        link.Parent.ReplaceChild(text, link);
                    }
                    else
                    {
                        // the link has other children: move all of them into a span so they are preserved
                        var container = doc.CreateElement("span");
                        while (link.ChildNodes.Length > 0)
                        {
                            container.AppendChild(link.ChildNodes[0]);
                        }
                        link.Parent.ReplaceChild(container, link);
                    }
                }
                else
                {
                    anchor.SetAttribute("href", uri.ToAbsoluteURI(href));
                }
            });

            var medias = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "img", "picture", "figure", "video", "audio", "source" });

            NodeUtility.ForEachNode(medias, (mediaNode) =>
            {
                if (mediaNode is IElement media)
                {
                    var src    = media.GetAttribute("src");
                    var poster = media.GetAttribute("poster");
                    var srcset = media.GetAttribute("srcset");

                    if (src != null)
                    {
                        media.SetAttribute("src", uri.ToAbsoluteURI(src));
                    }

                    if (poster != null)
                    {
                        media.SetAttribute("poster", uri.ToAbsoluteURI(poster));
                    }

                    if (srcset != null)
                    {
                        // Only capture group 1 of RE_SrcSetUrl is made absolute; groups 2 and 3
                        // are copied back unchanged (a group that did not participate yields "").
                        // NOTE(review): the pattern is defined outside this method; presumably
                        // group 1 is the candidate URL and 2/3 the descriptor and separator - confirm.
                        var newSrcset = RE_SrcSetUrl.Replace(srcset, (input) =>
                        {
                            return uri.ToAbsoluteURI(input.Groups[1].Value) + input.Groups[2].Value + input.Groups[3].Value;
                        });

                        media.SetAttribute("srcset", newSrcset);
                    }
                }
            });
        }