/// <summary>
/// Converts the href of each &lt;a&gt; and the src of each &lt;img&gt; in the given element,
/// and its descendants, to an absolute URI. Links that use a javascript: URI are replaced
/// by their content, since they cannot work once scripts are removed from the page.
/// </summary>
/// <param name="articleContent">The node in which to fix all relative uri</param>
/// <param name="uri">The base uri</param>
/// <param name="doc">The document to operate on; used to create the replacement nodes</param>
/// <exception cref="ArgumentNullException"><paramref name="uri"/> is null</exception>
internal static void FixRelativeUris(IElement articleContent, Uri uri, IHtmlDocument doc)
{
    if (uri == null)
    {
        throw new ArgumentNullException(nameof(uri));
    }

    var links = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "a" });
    NodeUtility.ForEachNode(links, (node) =>
    {
        if (!(node is IElement link))
        {
            return;
        }

        var href = link.GetAttribute("href");
        if (String.IsNullOrWhiteSpace(href))
        {
            return;
        }

        // URI schemes are case-insensitive and may be preceded by whitespace in markup,
        // so compare ordinally on the trimmed value instead of a culture-sensitive IndexOf.
        bool isScriptLink = href.TrimStart().StartsWith("javascript:", StringComparison.OrdinalIgnoreCase);
        if (!isScriptLink)
        {
            link.SetAttribute("href", uri.ToAbsoluteURI(href));
            return;
        }

        // Remove links with javascript: URIs, since
        // they won't work after scripts have been removed from the page.
        if (link.ChildNodes.Length == 1 && link.ChildNodes[0].NodeType == NodeType.Text)
        {
            // if the link only contains simple text content, it can be converted to a text node
            var text = doc.CreateTextNode(link.TextContent);
            link.Parent.ReplaceChild(text, link);
        }
        else
        {
            // if the link has multiple children, they should all be preserved
            var container = doc.CreateElement("span");
            while (link.ChildNodes.Length > 0)
            {
                // appending moves the node, so the first child changes on every pass
                container.AppendChild(link.ChildNodes[0]);
            }
            link.Parent.ReplaceChild(container, link);
        }
    });

    var imgs = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "img" });
    NodeUtility.ForEachNode(imgs, (node) =>
    {
        if (!(node is IElement img))
        {
            return;
        }

        var src = img.GetAttribute("src");
        if (!String.IsNullOrWhiteSpace(src))
        {
            img.SetAttribute("src", uri.ToAbsoluteURI(src));
        }
    });
}
/// <summary>
/// <para>Computes how much of the element's text sits inside links.</para>
/// <para>The result is the summed inner-text length of every &lt;a&gt; descendant
/// divided by the inner-text length of the element itself.</para>
/// </summary>
/// <param name="element">Element to operate on</param>
/// <returns>The link-text to total-text ratio, or 0 when the element has no text</returns>
internal static float GetLinkDensity(IElement element)
{
    var totalChars = NodeUtility.GetInnerText(element).Length;
    if (totalChars != 0)
    {
        // accumulate as float so the final division is a floating-point one
        float linkedChars = 0;
        var anchors = element.GetElementsByTagName("a");

        // XXX implement _reduceNodeList?
        NodeUtility.ForEachNode(anchors, (anchor) =>
        {
            var anchorText = NodeUtility.GetInnerText(anchor as IElement);
            linkedChars += anchorText.Length;
        });

        return linkedChars / totalChars;
    }

    return 0;
}
/// <summary>
/// Converts the href of each &lt;a&gt; and the src of each &lt;img&gt; in the given element,
/// and its descendants, to an absolute URI. Links that use a javascript: URI are replaced
/// by their content, since they cannot work once scripts are removed from the page.
/// </summary>
/// <param name="articleContent">The node in which to fix all relative uri</param>
/// <param name="uri">The base uri</param>
/// <param name="doc">The document to operate on; used to create the replacement nodes</param>
/// <exception cref="ArgumentNullException"><paramref name="uri"/> is null</exception>
public static void FixRelativeUris(IElement articleContent, Uri uri, IHtmlDocument doc)
{
    if (uri == null)
    {
        throw new ArgumentNullException(nameof(uri));
    }

    var links = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "a" });
    NodeUtility.ForEachNode(links, (node) =>
    {
        if (!(node is IElement link))
        {
            return;
        }

        var href = link.GetAttribute("href");
        if (String.IsNullOrWhiteSpace(href))
        {
            return;
        }

        // URI schemes are case-insensitive and may be preceded by whitespace in markup,
        // so compare ordinally on the trimmed value instead of a culture-sensitive IndexOf.
        bool isScriptLink = href.TrimStart().StartsWith("javascript:", StringComparison.OrdinalIgnoreCase);
        if (!isScriptLink)
        {
            link.SetAttribute("href", uri.ToAbsoluteURI(href));
            return;
        }

        // Replace links with javascript: URIs with their content, since
        // they won't work after scripts have been removed from the page.
        if (link.ChildNodes.Length == 1 && link.ChildNodes[0].NodeType == NodeType.Text)
        {
            // plain text links collapse to a text node
            var text = doc.CreateTextNode(link.TextContent);
            link.Parent.ReplaceChild(text, link);
        }
        else
        {
            // Keep element children (images, formatting) instead of flattening them to text:
            // move them into a neutral span that takes the place of the link.
            var container = doc.CreateElement("span");
            while (link.ChildNodes.Length > 0)
            {
                // appending moves the node, so the first child changes on every pass
                container.AppendChild(link.ChildNodes[0]);
            }
            link.Parent.ReplaceChild(container, link);
        }
    });

    var imgs = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "img" });
    NodeUtility.ForEachNode(imgs, (node) =>
    {
        if (!(node is IElement img))
        {
            return;
        }

        var src = img.GetAttribute("src");
        if (!String.IsNullOrWhiteSpace(src))
        {
            img.SetAttribute("src", uri.ToAbsoluteURI(src));
        }
    });
}
/// <summary>
/// Attempts to get metadata for the article: title, excerpt, author/byline, site name,
/// language, featured image and publication date.
/// </summary>
/// <param name="doc">The document whose meta, html and time elements are inspected</param>
/// <param name="uri">The address of the document; its path and query are searched for a /yyyy/MM/dd date as a last resort</param>
/// <param name="language">A language chosen by the caller; when not empty it wins over anything the document declares</param>
/// <returns>
/// The metadata that could be found. Excerpt, Language, FeaturedImage and Author fall back
/// to an empty string; PublicationDate stays null when no usable date is found.
/// </returns>
public static Metadata GetArticleMetadata(IHtmlDocument doc, Uri uri, string language)
{
    Metadata metadata = new Metadata();
    Dictionary<string, string> values = new Dictionary<string, string>();
    var metaElements = doc.GetElementsByTagName("meta");

    // Match "description", or Twitter's "twitter:description" (Cards)
    // in name attribute.
    // name is a single value
    var namePattern = @"^\s*((?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|image|site_name)|name)\s*$";

    // Match Facebook's Open Graph title & description properties.
    // property is a space-separated list of values
    var propertyPattern = @"\s*(dc|dcterm|og|twitter|article)\s*:\s*(author|creator|description|title|published_time|image|site_name)(\s+|$)";

    var itemPropPattern = @"\s*datePublished\s*";

    // Collect the interesting meta tags into the values dictionary.
    NodeUtility.ForEachNode(metaElements, (element) =>
    {
        if (!(element is IElement meta))
        {
            return;
        }

        var elementName = meta.GetAttribute("name") ?? "";
        var elementProperty = meta.GetAttribute("property") ?? "";
        var itemProp = meta.GetAttribute("itemprop") ?? "";
        var content = meta.GetAttribute("content");

        // a meta tag without content carries nothing we can use
        if (String.IsNullOrEmpty(content))
        {
            return;
        }

        // an exact "author" in any of the three attributes is taken as is
        if (elementName == "author" || elementProperty == "author" || itemProp == "author")
        {
            metadata.Byline = content;
            metadata.Author = content;
            return;
        }

        MatchCollection matches = null;
        string name = "";

        if (!String.IsNullOrEmpty(elementProperty))
        {
            // property may list several values; each recognised one is stored with this content
            matches = Regex.Matches(elementProperty, propertyPattern);
            for (int i = matches.Count - 1; i >= 0; i--)
            {
                // Convert to lowercase, and remove any whitespace
                // so we can match below. Invariant casing keeps the keys
                // stable whatever the current culture is.
                name = Regex.Replace(matches[i].Value.ToLowerInvariant(), @"\s+", "");
                values[name] = content.Trim();
            }
        }

        if ((matches == null || matches.Count == 0)
            && !String.IsNullOrEmpty(elementName)
            && Regex.IsMatch(elementName, namePattern, RegexOptions.IgnoreCase))
        {
            // Convert to lowercase, remove any whitespace, and convert dots
            // to colons so we can match below.
            name = Regex.Replace(Regex.Replace(elementName.ToLowerInvariant(), @"\s+", ""), @"\.", ":");
            values[name] = content.Trim();
        }
        else if (Regex.IsMatch(elementProperty, propertyPattern, RegexOptions.IgnoreCase))
        {
            name = elementProperty;
        }
        else if (Regex.IsMatch(itemProp, itemPropPattern, RegexOptions.IgnoreCase))
        {
            name = itemProp;
        }

        if (!String.IsNullOrEmpty(name))
        {
            // Convert to lowercase and remove any whitespace
            // so we can match below. Existing entries are never overwritten here.
            name = Regex.Replace(name.ToLowerInvariant(), @"\s", "", RegexOptions.IgnoreCase);
            if (!values.ContainsKey(name))
            {
                values.Add(name, content.Trim());
            }
        }
    });

    // Returns the first non-empty stored value among the given keys, or "" when there is none.
    string FirstValue(params string[] keys)
    {
        foreach (var key in keys)
        {
            if (values.TryGetValue(key, out var found) && !String.IsNullOrEmpty(found))
            {
                return found;
            }
        }
        return "";
    }

    // Returns the first stored value among the given keys that parses as a date, or null.
    // DateTime.MinValue is treated as "no date", as it was the sentinel for a failed lookup.
    DateTime? FirstDate(params string[] keys)
    {
        foreach (var key in keys)
        {
            if (values.TryGetValue(key, out var raw)
                && DateTime.TryParse(raw, out var parsed)
                && parsed != DateTime.MinValue)
            {
                return parsed;
            }
        }
        return null;
    }

    // Find the description of the article
    metadata.Excerpt = FirstValue(
        "description",
        "dc:description",
        "dcterm:description",
        "og:description",
        "weibo:article:description",
        "weibo:webpage:description",
        "twitter:description");

    // Get the name of the site
    if (values.TryGetValue("og:site_name", out var siteName))
    {
        metadata.SiteName = siteName;
    }

    // Find the title of the article
    metadata.Title = FirstValue(
        "dc:title",
        "dcterm:title",
        "og:title",
        "weibo:article:title",
        "weibo:webpage:title",
        "twitter:title",
        "title");

    // Let's try to eliminate the site name from the title
    metadata.Title = Readability.CleanTitle(metadata.Title, metadata.SiteName);

    // We did not find any title,
    // we try to get it from the title tag
    if (String.IsNullOrEmpty(metadata.Title))
    {
        metadata.Title = Readability.GetArticleTitle(doc);
    }

    // Language extraction. This stays a lazy iterator on purpose: the document
    // is only queried when the candidates before it are empty.
    IEnumerable<string> LanguageHeuristics()
    {
        yield return language;
        yield return doc.GetElementsByTagName("html")[0].GetAttribute("lang");
        yield return doc.GetElementsByTagName("html")[0].GetAttribute("xml:lang");
        yield return doc.QuerySelector("meta[http-equiv=\"Content-Language\"]")?.GetAttribute("content");
        // this is wrong, but it's used
        yield return doc.QuerySelector("meta[name=\"lang\"]")?.GetAttribute("value");
    }

    metadata.Language = LanguageHeuristics().FirstOrDefault(l => !String.IsNullOrEmpty(l)) ?? "";

    // Find the featured image of the article
    metadata.FeaturedImage = FirstValue(
        "og:image",
        "twitter:image",
        "weibo:article:image",
        "weibo:webpage:image");

    if (String.IsNullOrEmpty(metadata.Author))
    {
        // We try to find a meta tag for the author.
        // Note that there is an Open Graph tag for an author,
        // but it usually contains a profile URL of the author.
        // So we do not use it
        metadata.Author = FirstValue("dc:creator", "dcterm:creator", "author");
    }

    // Date extraction, from the meta tags first
    metadata.PublicationDate = FirstDate(
        "article:published_time",
        "date",
        "datepublished",
        "weibo:article:create_at",
        "weibo:webpage:create_at");

    if (metadata.PublicationDate == null)
    {
        foreach (var time in doc.GetElementsByTagName("time"))
        {
            // pubdate is a boolean attribute, usually written without a value
            // (<time pubdate datetime="...">), so test for presence, not for content.
            // When several elements qualify, the last one wins.
            if (time.GetAttribute("pubDate") != null
                && DateTime.TryParse(time.GetAttribute("datetime"), out var parsed))
            {
                metadata.PublicationDate = parsed;
            }
        }
    }

    if (metadata.PublicationDate == null)
    {
        // as a last resort check the URL for a date
        Match maybeDate = Regex.Match(uri.PathAndQuery, "/(?<year>[0-9]{4})/(?<month>[0-9]{2})/(?<day>[0-9]{2})?");
        if (maybeDate.Success)
        {
            int year = int.Parse(maybeDate.Groups["year"].Value);
            int month = int.Parse(maybeDate.Groups["month"].Value);
            int day = !String.IsNullOrEmpty(maybeDate.Groups["day"].Value)
                ? int.Parse(maybeDate.Groups["day"].Value)
                : 1;

            // The pattern only checks digit counts, so paths such as /1000/20/ match too;
            // building a DateTime from them would throw, hence the range checks.
            bool isRealDate = year >= 1
                && month >= 1 && month <= 12
                && day >= 1 && day <= DateTime.DaysInMonth(year, month);
            if (isRealDate)
            {
                metadata.PublicationDate = new DateTime(year, month, day);
            }
        }
    }

    return metadata;
}
/// <summary>
/// Converts the uris of links and media elements in the given element, and its descendants,
/// to absolute URIs: href on &lt;a&gt;, and src, poster and srcset on &lt;img&gt;, &lt;picture&gt;,
/// &lt;figure&gt;, &lt;video&gt;, &lt;audio&gt; and &lt;source&gt;. Links that use a javascript: URI
/// are replaced by their content, since they cannot work once scripts are removed from the page.
/// </summary>
/// <param name="articleContent">The node in which to fix all relative uri</param>
/// <param name="uri">The base uri</param>
/// <param name="doc">The document to operate on; used to create the replacement nodes</param>
/// <exception cref="ArgumentNullException"><paramref name="uri"/> is null</exception>
internal static void FixRelativeUris(IElement articleContent, Uri uri, IHtmlDocument doc)
{
    if (uri == null)
    {
        throw new ArgumentNullException(nameof(uri));
    }

    var links = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "a" });
    NodeUtility.ForEachNode(links, (node) =>
    {
        if (!(node is IElement link))
        {
            return;
        }

        var href = link.GetAttribute("href");
        if (string.IsNullOrWhiteSpace(href))
        {
            return;
        }

        // URI schemes are case-insensitive and may be preceded by whitespace in markup,
        // so compare ordinally on the trimmed value instead of a culture-sensitive IndexOf.
        bool isScriptLink = href.TrimStart().StartsWith("javascript:", StringComparison.OrdinalIgnoreCase);
        if (!isScriptLink)
        {
            link.SetAttribute("href", uri.ToAbsoluteURI(href));
            return;
        }

        // Remove links with javascript: URIs, since
        // they won't work after scripts have been removed from the page.
        if (link.ChildNodes.Length == 1 && link.ChildNodes[0].NodeType == NodeType.Text)
        {
            // if the link only contains simple text content, it can be converted to a text node
            var text = doc.CreateTextNode(link.TextContent);
            link.Parent.ReplaceChild(text, link);
        }
        else
        {
            // if the link has multiple children, they should all be preserved
            var container = doc.CreateElement("span");
            while (link.ChildNodes.Length > 0)
            {
                // appending moves the node, so the first child changes on every pass
                container.AppendChild(link.ChildNodes[0]);
            }
            link.Parent.ReplaceChild(container, link);
        }
    });

    var medias = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "img", "picture", "figure", "video", "audio", "source" });
    NodeUtility.ForEachNode(medias, (node) =>
    {
        if (!(node is IElement media))
        {
            return;
        }

        var src = media.GetAttribute("src");
        var poster = media.GetAttribute("poster");
        var srcset = media.GetAttribute("srcset");

        // Blank attributes are skipped, like blank hrefs above: resolving an empty
        // reference would presumably yield the page address itself (depends on
        // ToAbsoluteURI, which is not visible here - confirm).
        if (!string.IsNullOrWhiteSpace(src))
        {
            media.SetAttribute("src", uri.ToAbsoluteURI(src));
        }

        if (!string.IsNullOrWhiteSpace(poster))
        {
            media.SetAttribute("poster", uri.ToAbsoluteURI(poster));
        }

        if (!string.IsNullOrWhiteSpace(srcset))
        {
            // Only group 1 of each match is resolved as a url; groups 2 and 3 are copied
            // through unchanged. A group that did not take part in the match has an empty Value.
            var newSrcset = RE_SrcSetUrl.Replace(srcset, (candidate) =>
                uri.ToAbsoluteURI(candidate.Groups[1].Value)
                + candidate.Groups[2].Value
                + candidate.Groups[3].Value);
            media.SetAttribute("srcset", newSrcset);
        }
    });
}