/// <summary>
/// Substitutes the site-specific <c>{class}</c> and <c>{allow}</c> placeholders in an export
/// template and renders the result with the item and feed exposed as template attributes.
/// </summary>
/// <param name="item">
/// Export item being rendered. Its <c>SiteName</c> selects the placeholder values and its
/// <c>ArticleText</c> is exposed to the template as <c>ArticleText</c>.
/// </param>
/// <param name="feed">Feed the item belongs to, exposed to the template as <c>feed</c>.</param>
/// <param name="template">Template text; <c>'$'</c> is passed for both delimiter arguments.</param>
/// <returns>The rendered template text.</returns>
protected virtual string ApplyTemplateToDescription(ExportFeedItem item, RssFeed feed, string template)
{
    string cssClass;
    string allow;

    // A null SiteName falls through to the default case instead of throwing, and the
    // invariant lowering keeps the match independent of the current culture.
    switch (item.SiteName?.ToLowerInvariant())
    {
        case "youtube":
        case "youtu.be":
            cssClass = "";
            allow = "accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture";
            break;

        case "rumble":
            cssClass = "rumble";
            allow = "";
            break;

        case "gab tv":
            cssClass = "studio-video";
            allow = "";
            break;

        default:
            // Unlisted sites get empty values so the raw placeholders never reach the output.
            cssClass = "";
            allow = "";
            break;
    }

    template = template.Replace("{class}", cssClass).Replace("{allow}", allow);

    var renderer = new Template(template, '$', '$');
    renderer.Add("item", item);
    renderer.Add("feed", feed);
    renderer.Add("ArticleText", item.ArticleText);
    return renderer.Render();
}
/// <summary>
/// Populates an export item from the Open Graph tags and parsed HTML of a successfully
/// crawled article, then applies a handful of site-specific fix-ups.
/// </summary>
/// <param name="exportFeedItem">
/// Item to populate. Its <c>Url</c>, <c>ArticleText</c>, <c>Subtitle</c>, <c>ImageUrl</c>,
/// <c>SiteName</c> and <c>HostName</c> are overwritten.
/// </param>
/// <param name="item">Crawled feed item supplying the Open Graph, HTML and feed attributes.</param>
/// <param name="hostName">Host name stored on the item and used when no usable site name exists.</param>
protected virtual void SetExtendedArticleMetaData(ExportFeedItem exportFeedItem, RssFeedItem item, string hostName)
{
    // Keep the URL the caller assigned so it can serve as a last resort below; previously
    // it was captured but never used, and Url could end up null.
    string originalUrl = exportFeedItem.Url;

    // Extract the meta data from the Open Graph tags helpfully provided with almost every article
    exportFeedItem.Url = item.OpenGraphAttributes.GetValueOrDefault("og:url") ?? "";

    // Make sure the Url is complete
    if (!exportFeedItem.Url.StartsWith("http", StringComparison.Ordinal))
    {
        exportFeedItem.Url = item.HtmlAttributes.GetValueOrDefault("Url")
            ?? item.FeedAttributes.Url
            ?? originalUrl;
    }

    exportFeedItem.ArticleText = item.HtmlAttributes.GetValueOrDefault("ParserResult") ?? "";
    exportFeedItem.Subtitle = item.OpenGraphAttributes.GetValueOrDefault("og:title");
    exportFeedItem.ImageUrl = item.OpenGraphAttributes.GetValueOrDefault("og:image");
    exportFeedItem.SiteName = item.OpenGraphAttributes.GetValueOrDefault("og:site_name")?.ToLowerInvariant() ?? "";
    exportFeedItem.HostName = hostName;

    // Fixup apnews on populist press links which sometimes report incorrectly
    bool isMislabelledApNews =
        exportFeedItem.SiteName == "ap news" &&
        (exportFeedItem.Url?.Contains("populist.press") ?? false);
    if (string.IsNullOrWhiteSpace(exportFeedItem.SiteName) || isMislabelledApNews)
    {
        exportFeedItem.SiteName = exportFeedItem.HostName;
    }

    // Fixup news.trust.org imageUrl links which have an embedded redirect
    bool isTrustOrgArticle =
        exportFeedItem.SiteName == "news.trust.org" &&
        (exportFeedItem.Url?.Contains("news.trust.org") ?? false);
    if (string.IsNullOrWhiteSpace(exportFeedItem.ImageUrl) || isTrustOrgArticle)
    {
        exportFeedItem.ImageUrl = null;
    }

    // Remove the protocol portion if there is one, i.e. 'https://'. Trailing slashes are
    // dropped first so 'https://example.com/' yields 'example.com' rather than an empty name.
    if (exportFeedItem.SiteName != null && exportFeedItem.SiteName.IndexOf('/') > 0)
    {
        string withoutTrailingSlash = exportFeedItem.SiteName.TrimEnd('/');
        exportFeedItem.SiteName = withoutTrailingSlash.Substring(withoutTrailingSlash.LastIndexOf('/') + 1);
    }
}
/// <summary>
/// Converts a crawled feed item into an export item, choosing the graphic, video, extended
/// or basic template according to the metadata available for it.
/// </summary>
/// <param name="item">Crawled feed item supplying the feed, Open Graph and HTML attributes.</param>
/// <param name="feed">Feed the item belongs to; passed through to the template renderer.</param>
/// <returns>A new export item whose <c>ArticleText</c> holds the rendered template.</returns>
public ExportFeedItem FormatItem(RssFeedItem item, RssFeed feed)
{
    // The UrlHash is a hash of the feed source, not the ultimate target URL. This is to avoid
    // over-crawling with link shortening services such as bit.ly and t.co. Once we detect a hash
    // has been crawled from the source, there is no need to crawl again. It means the hash does
    // not truly reflect the target URL, but that's ok as there are duplicate crawls across the
    // different feeds anyway.
    var exportFeedItem = new ExportFeedItem
    {
        Id = Guid.NewGuid().ToString(),
        FeedId = item.FeedAttributes.FeedId,
        Url = GetCanonicalUrl(item),
        UrlHash = item.FeedAttributes.UrlHash,
        DateAdded = item.FeedAttributes.DateAdded,
        LinkLocation = item.FeedAttributes.LinkLocation,
        Title = item.FeedAttributes.Title
    };

    Uri uri = new Uri(exportFeedItem.Url);
    string hostName = uri.GetComponents(UriComponents.Host, UriFormat.Unescaped).ToLowerInvariant();

    // Images and PDFs are linked directly rather than parsed as articles. The extension is
    // matched case-insensitively so that e.g. 'PHOTO.JPG' takes this path as well.
    var fileName = item.FeedAttributes.FileName ?? "";
    bool isGraphicFile =
        fileName.EndsWith(".png", StringComparison.OrdinalIgnoreCase) ||
        fileName.EndsWith(".jpg", StringComparison.OrdinalIgnoreCase) ||
        fileName.EndsWith(".gif", StringComparison.OrdinalIgnoreCase) ||
        fileName.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase);
    if (isGraphicFile)
    {
        SetGraphicMetaData(item, exportFeedItem);
        exportFeedItem.ArticleText = ApplyTemplateToDescription(exportFeedItem, feed, ExportTemplates.GraphicTemplate);
        return exportFeedItem;
    }

    bool isRumble = item.SiteName == "rumble";
    bool isBitchute = item.SiteName == "bitchute";

    string videoUrl = item.OpenGraphAttributes.GetValueOrDefault("og:video:secure_url")
        ?? item.OpenGraphAttributes.GetValueOrDefault("og:video:url")
        ?? item.OpenGraphAttributes.GetValueOrDefault("og:video")
        ?? item.OpenGraphAttributes.GetValueOrDefault("og:x:video")
        ?? "";

    // Some sites do not provide OpenGraph video tags so watch for those specifically.
    // These checks stay case-sensitive, as before.
    string inferredVideoType;
    if (videoUrl.EndsWith(".mp4", StringComparison.Ordinal) || isBitchute)
    {
        inferredVideoType = "video/mp4";
    }
    else if (videoUrl.Contains("youtube.com") || isRumble)
    {
        inferredVideoType = "text/html";
    }
    else
    {
        inferredVideoType = "";
    }

    string videoType = item.OpenGraphAttributes.GetValueOrDefault("og:video:type")
        ?? item.OpenGraphAttributes.GetValueOrDefault("og:x:video:type")
        ?? inferredVideoType;

    bool isDirectVideoStream = videoType == "video/mp4" || videoType == "application/x-mpegURL";
    bool hasSupportedVideoFormat =
        (videoUrl.Length > 0 || isRumble || isBitchute) &&
        (videoType == "text/html" || isDirectVideoStream);

    if (hasSupportedVideoFormat)
    {
        Log.Debug("Applying video metadata values for '{hostname}'", hostName);
        SetVideoMetaData(exportFeedItem, item, hostName);

        string videoTemplate;
        if (exportFeedItem.VideoHeight > 0)
        {
            videoTemplate = isDirectVideoStream
                ? ExportTemplates.Mp4VideoTemplate
                : ExportTemplates.HtmlVideoTemplate;
        }
        else
        {
            // No usable video dimensions were found, so render it like an ordinary article
            videoTemplate = ExportTemplates.ExtendedTemplate;
        }

        exportFeedItem.ArticleText = ApplyTemplateToDescription(exportFeedItem, feed, videoTemplate);
        return exportFeedItem;
    }

    var parserResult = item.HtmlAttributes.GetValueOrDefault("ParserResult") ?? "";
    if (string.IsNullOrEmpty(parserResult))
    {
        // Article failed to download, display minimal basic meta data
        Log.Debug("No parsed result, applying basic metadata values for '{hostname}'", hostName);
        SetBasicArticleMetaData(exportFeedItem, item, hostName);
        exportFeedItem.ArticleText = ApplyTemplateToDescription(exportFeedItem, feed, ExportTemplates.BasicTemplate);
    }
    else
    {
        Log.Debug("Applying extended metadata values for '{hostname}'", hostName);
        SetExtendedArticleMetaData(exportFeedItem, item, hostName);
        exportFeedItem.ArticleText = ApplyTemplateToDescription(exportFeedItem, feed, ExportTemplates.ExtendedTemplate);
    }

    return exportFeedItem;
}
/// <summary>
/// Fills in the minimal metadata used when no article content is available: the host name
/// doubles as the site name and the article text becomes a fixed notice.
/// </summary>
/// <param name="exportFeedItem">Item whose <c>HostName</c>, <c>SiteName</c> and <c>ArticleText</c> are set.</param>
/// <param name="item">Crawled feed item; not read by this implementation.</param>
/// <param name="hostName">Host name stored as both the host and the site name.</param>
protected virtual void SetBasicArticleMetaData(ExportFeedItem exportFeedItem, RssFeedItem item, string hostName)
{
    const string crawlFailedNotice =
        "<p>Unable to crawl article content. Click the link below to view in your browser.</p>";

    exportFeedItem.HostName = hostName;
    exportFeedItem.SiteName = hostName;
    exportFeedItem.ArticleText = crawlFailedNotice;
}
/// <summary>
/// Populates the export item for a directly linked graphic: the feed URL becomes the image
/// URL and the feed item's host name is used as both host and site name.
/// </summary>
/// <param name="item">Crawled feed item supplying the feed URL and host name.</param>
/// <param name="exportFeedItem">Item whose <c>ImageUrl</c>, <c>HostName</c> and <c>SiteName</c> are set.</param>
protected virtual void SetGraphicMetaData(RssFeedItem item, ExportFeedItem exportFeedItem)
{
    var sourceHost = item.HostName;

    exportFeedItem.ImageUrl = item.FeedAttributes.Url;
    exportFeedItem.HostName = sourceHost;
    exportFeedItem.SiteName = sourceHost;
}
/// <summary>
/// Populates an export item with video metadata (URL, dimensions, title, image, site name)
/// taken from Open Graph tags or, for rumble and bitchute, from the parser result.
/// </summary>
/// <remarks>
/// When the rumble or bitchute parser result is empty or does not contain the expected
/// content, the video URL and dimensions are left untouched instead of throwing.
/// </remarks>
/// <param name="exportFeedItem">Item to populate; its <c>ArticleText</c> is always overwritten.</param>
/// <param name="item">Crawled feed item supplying the Open Graph and HTML attributes.</param>
/// <param name="hostName">Host name stored on the item and used when no site name is present.</param>
protected virtual void SetVideoMetaData(ExportFeedItem exportFeedItem, RssFeedItem item, string hostName)
{
    // Extract the meta data from the Open Graph tags
    exportFeedItem.Subtitle = item.OpenGraphAttributes.GetValueOrDefault("og:title") ?? "";
    exportFeedItem.ImageUrl = item.OpenGraphAttributes.GetValueOrDefault("og:image") ?? "";
    exportFeedItem.SiteName = item.OpenGraphAttributes.GetValueOrDefault("og:site_name")?.ToLowerInvariant() ?? "";
    exportFeedItem.HostName = hostName;

    var description = item.OpenGraphAttributes.GetValueOrDefault("og:description") ?? "";

    if (string.IsNullOrWhiteSpace(exportFeedItem.SiteName))
    {
        exportFeedItem.SiteName = hostName;
    }

    // Every branch below reads the parser result, so fetch it once
    var parserResult = item.HtmlAttributes.GetValueOrDefault("ParserResult") ?? "";

    if (item.SiteName == "rumble")
    {
        // A result starting with '<' is markup rather than ld+json. A blank result has
        // nothing to parse and would make the deserializer return null.
        bool hasJsonPayload =
            !string.IsNullOrWhiteSpace(parserResult) &&
            !parserResult.StartsWith("<", StringComparison.Ordinal);

        if (hasJsonPayload)
        {
            Log.Debug("EXPORT: Processing rumble.com ld+json metadata");

            // application/ld+json parser result
            var candidates = JsonConvert.DeserializeObject<List<JsonLdRumbleValues>>(parserResult);
            if (candidates != null)
            {
                // Use the first entry that carries an embed URL
                foreach (var candidate in candidates)
                {
                    if (string.IsNullOrWhiteSpace(candidate?.embedUrl))
                    {
                        continue;
                    }

                    exportFeedItem.VideoUrl = candidate.embedUrl;
                    exportFeedItem.VideoHeight = int.TryParse(Convert.ToString(candidate.height), out int height) ? height : 0;
                    exportFeedItem.VideoWidth = int.TryParse(Convert.ToString(candidate.width), out int width) ? width : 0;
                    break;
                }
            }
        }
    }
    else if (item.SiteName == "bitchute")
    {
        Log.Information("EXPORT: Processing bitchute.com metadata");

        // Bitchute logic is a little convoluted, they don't provide much metadata:
        // take the text from the first 'https://' up to the next double quote.
        var start = parserResult.IndexOf("https://", StringComparison.Ordinal);
        var end = start >= 0 ? parserResult.IndexOf('"', start) : -1;

        if (end > start)
        {
            exportFeedItem.VideoUrl = parserResult.Substring(start, end - start);
            exportFeedItem.VideoHeight = 1080;
            exportFeedItem.VideoWidth = 1920;
        }
        else
        {
            // Previously this case threw ArgumentOutOfRangeException
            Log.Warning("EXPORT: Unable to locate a video url in the bitchute.com parser result");
        }
    }
    else
    {
        Log.Debug("EXPORT: Processing open graph video metadata");

        // Sites that provide video metadata via open graph tags
        exportFeedItem.VideoUrl = item.OpenGraphAttributes.GetValueOrDefault("og:video:secure_url")
            ?? item.OpenGraphAttributes.GetValueOrDefault("og:video:url")
            ?? item.OpenGraphAttributes.GetValueOrDefault("og:video")
            ?? item.OpenGraphAttributes.GetValueOrDefault("og:x:video")
            ?? "";

        string rawHeight = item.OpenGraphAttributes.GetValueOrDefault("og:video:height")
            ?? item.OpenGraphAttributes.GetValueOrDefault("og:x:video:height")
            ?? item.OpenGraphAttributes.GetValueOrDefault("og:image:height");
        string rawWidth = item.OpenGraphAttributes.GetValueOrDefault("og:video:width")
            ?? item.OpenGraphAttributes.GetValueOrDefault("og:x:video:width")
            ?? item.OpenGraphAttributes.GetValueOrDefault("og:image:width");

        exportFeedItem.VideoHeight = int.TryParse(rawHeight, out int height) ? height : 0;
        exportFeedItem.VideoWidth = int.TryParse(rawWidth, out int width) ? width : 0;

        // Prefer the parsed page content over the Open Graph description when there is any
        if (!string.IsNullOrEmpty(parserResult))
        {
            description = parserResult;
        }
    }

    using (LogContext.PushProperty("hostName", hostName))
    {
        Log.Information("Video URL: '{url}' ({height}x{width})", exportFeedItem.VideoUrl, exportFeedItem.VideoHeight, exportFeedItem.VideoWidth);
    }

    // There's no article text for most video sites, so just use the meta description
    exportFeedItem.ArticleText = $"<p>{description}</p>";
}