示例#1
0
    /// <summary>
    /// Substitutes the site-specific <c>{class}</c> and <c>{allow}</c> placeholders in
    /// <paramref name="template"/> and renders the result with the item and feed attached.
    /// </summary>
    /// <param name="item">Item added to the template as <c>item</c>; its <c>ArticleText</c> is also added as <c>ArticleText</c>.</param>
    /// <param name="feed">Feed added to the template as <c>feed</c>.</param>
    /// <param name="template">Template text; the template is constructed with <c>$</c> for both character arguments.</param>
    /// <returns>The rendered template text.</returns>
    protected virtual string ApplyTemplateToDescription(ExportFeedItem item, RssFeed feed, string template)
    {
        // Defaults apply to every site without special handling so that no raw
        // "{class}" / "{allow}" placeholder is left behind in the rendered output.
        string cssClass = "";
        string allow    = "";

        // SiteName may be unset (e.g. copied from a missing host name); treat that as "no special site".
        // The invariant culture keeps the comparison against the ASCII literals stable in every locale.
        switch ((item.SiteName ?? "").ToLowerInvariant())
        {
            case "youtube":
            case "youtu.be":
                allow = "accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture";
                break;

            case "rumble":
                cssClass = "rumble";
                break;

            case "gab tv":
                cssClass = "studio-video";
                break;
        }

        template = template.Replace("{class}", cssClass)
                           .Replace("{allow}", allow);

        var t = new Template(template, '$', '$');

        t.Add("item", item);
        t.Add("feed", feed);
        t.Add("ArticleText", item.ArticleText);

        return t.Render();
    }
示例#2
0
    /// <summary>
    /// Populates <paramref name="exportFeedItem"/> from the Open Graph, HTML and feed
    /// attributes of a successfully parsed article.
    /// </summary>
    /// <param name="exportFeedItem">Item being populated; its Url, ArticleText, Subtitle, ImageUrl, SiteName and HostName are overwritten.</param>
    /// <param name="item">Crawled item supplying the Open Graph, HTML and feed attributes.</param>
    /// <param name="hostName">Host name stored on the item and used as the site name when no usable one is reported.</param>
    protected virtual void SetExtendedArticleMetaData(ExportFeedItem exportFeedItem, RssFeedItem item, string hostName)
    {
        // Remember the URL already present on the item so it can act as a last-resort fallback
        string originalUrl = exportFeedItem.Url;

        // Extract the meta data from the Open Graph tags helpfully provided with almost every article
        exportFeedItem.Url = item.OpenGraphAttributes.GetValueOrDefault("og:url") ?? "";

        // Make sure the Url is complete; the comparison is ordinal and ignores the casing of the scheme
        if (!exportFeedItem.Url.StartsWith("http", StringComparison.OrdinalIgnoreCase))
        {
            // Never leave Url null: the site-specific fixups below call Url.Contains(...)
            exportFeedItem.Url = item.HtmlAttributes.GetValueOrDefault("Url") ??
                                 item.FeedAttributes.Url ??
                                 originalUrl ??
                                 "";
        }

        // Extract the meta data from the Open Graph tags
        exportFeedItem.ArticleText = item.HtmlAttributes.GetValueOrDefault("ParserResult") ?? "";
        exportFeedItem.Subtitle    = item.OpenGraphAttributes.GetValueOrDefault("og:title");
        exportFeedItem.ImageUrl    = item.OpenGraphAttributes.GetValueOrDefault("og:image");
        exportFeedItem.SiteName    = item.OpenGraphAttributes.GetValueOrDefault("og:site_name")?.ToLowerInvariant() ?? "";
        exportFeedItem.HostName    = hostName;

        // Fixup apnews on populist press links which sometimes report incorrectly
        if (string.IsNullOrWhiteSpace(exportFeedItem.SiteName) ||
            (exportFeedItem.SiteName == "ap news" && exportFeedItem.Url.Contains("populist.press")))
        {
            exportFeedItem.SiteName = exportFeedItem.HostName;
        }

        // Fixup news.trust.org imageUrl links which have an embedded redirect
        if (string.IsNullOrWhiteSpace(exportFeedItem.ImageUrl) ||
            (exportFeedItem.SiteName == "news.trust.org" && exportFeedItem.Url.Contains("news.trust.org")))
        {
            exportFeedItem.ImageUrl = null;
        }

        // Remove the protocol portion if there is one, i.e. 'https://'.
        // SiteName can be null here when hostName is null, hence the explicit null check.
        if (exportFeedItem.SiteName != null && exportFeedItem.SiteName.IndexOf('/') > 0)
        {
            // Drop trailing slashes first; otherwise 'https://example.com/' would collapse to an empty name
            string trimmed = exportFeedItem.SiteName.TrimEnd('/');
            exportFeedItem.SiteName = trimmed.Substring(trimmed.LastIndexOf('/') + 1);
        }
    }
示例#3
0
    /// <summary>
    /// Builds an <see cref="ExportFeedItem"/> for a crawled item, choosing between the graphic,
    /// video, basic and extended presentations and rendering the matching template into
    /// <c>ArticleText</c>.
    /// </summary>
    /// <param name="item">Crawled item supplying the feed, Open Graph and HTML attributes.</param>
    /// <param name="feed">Feed passed through to the template rendering.</param>
    /// <returns>The populated export item.</returns>
    /// <exception cref="UriFormatException">The canonical URL of the item is not a valid absolute URI.</exception>
    public ExportFeedItem FormatItem(RssFeedItem item, RssFeed feed)
    {
        // The UrlHash is a hash of the feed source, not the ultimate target URL. This is to avoid
        // over-crawling with link shortening services such as bit.ly and t.co. Once we detect a hash
        // has been crawled from the source, there is no need to crawl again. It means the hash does
        // not truly reflect the target URL, but that's ok as there are duplicate crawls across the
        // different feeds anyway.
        var exportFeedItem = new ExportFeedItem
        {
            Id           = Guid.NewGuid().ToString(),
            FeedId       = item.FeedAttributes.FeedId,
            Url          = GetCanonicalUrl(item),
            UrlHash      = item.FeedAttributes.UrlHash,
            DateAdded    = item.FeedAttributes.DateAdded,
            LinkLocation = item.FeedAttributes.LinkLocation,
            Title        = item.FeedAttributes.Title
        };

        Uri    uri      = new Uri(exportFeedItem.Url);
        string hostName = uri.GetComponents(UriComponents.Host, UriFormat.Unescaped).ToLowerInvariant();

        var fileName = item.FeedAttributes.FileName ?? "";

        // File extensions are matched ordinally and without regard to case so that
        // names such as 'PHOTO.PNG' are treated as graphics too.
        bool isGraphicFile = fileName.EndsWith(".png", StringComparison.OrdinalIgnoreCase) ||
                             fileName.EndsWith(".jpg", StringComparison.OrdinalIgnoreCase) ||
                             fileName.EndsWith(".gif", StringComparison.OrdinalIgnoreCase) ||
                             fileName.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase);

        if (isGraphicFile)
        {
            SetGraphicMetaData(item, exportFeedItem);
            exportFeedItem.ArticleText = ApplyTemplateToDescription(exportFeedItem, feed, ExportTemplates.GraphicTemplate);
            return exportFeedItem;
        }

        string videoUrl = item.OpenGraphAttributes.GetValueOrDefault("og:video:secure_url") ??
                          item.OpenGraphAttributes.GetValueOrDefault("og:video:url") ??
                          item.OpenGraphAttributes.GetValueOrDefault("og:video") ??
                          item.OpenGraphAttributes.GetValueOrDefault("og:x:video") ??
                          "";

        // Some sites do not provide OpenGraph video tags so watch for those specifically
        string inferredVideoType;
        if (videoUrl.EndsWith(".mp4", StringComparison.OrdinalIgnoreCase) || item.SiteName == "bitchute")
        {
            inferredVideoType = "video/mp4";
        }
        else if (videoUrl.Contains("youtube.com") || item.SiteName == "rumble")
        {
            inferredVideoType = "text/html";
        }
        else
        {
            inferredVideoType = "";
        }

        string videoType = item.OpenGraphAttributes.GetValueOrDefault("og:video:type") ??
                           item.OpenGraphAttributes.GetValueOrDefault("og:x:video:type") ??
                           inferredVideoType;

        bool isDirectVideo = videoType == "video/mp4" || videoType == "application/x-mpegURL";

        // rumble and bitchute are handled even without an Open Graph video URL
        bool hasSupportedVideoFormat = (videoUrl.Length > 0 || item.SiteName == "rumble" || item.SiteName == "bitchute") &&
                                       (videoType == "text/html" || isDirectVideo);

        if (hasSupportedVideoFormat)
        {
            Log.Debug("Applying video metadata values for '{hostname}'", hostName);
            SetVideoMetaData(exportFeedItem, item, hostName);

            string videoTemplate;
            if (exportFeedItem.VideoHeight <= 0)
            {
                // No usable video dimensions were found, so present the item as an extended article
                videoTemplate = ExportTemplates.ExtendedTemplate;
            }
            else if (isDirectVideo)
            {
                videoTemplate = ExportTemplates.Mp4VideoTemplate;
            }
            else
            {
                videoTemplate = ExportTemplates.HtmlVideoTemplate;
            }

            exportFeedItem.ArticleText = ApplyTemplateToDescription(exportFeedItem, feed, videoTemplate);
            return exportFeedItem;
        }

        var result = item.HtmlAttributes.GetValueOrDefault("ParserResult") ?? "";
        if (string.IsNullOrEmpty(result))
        {
            Log.Debug("No parsed result, applying basic metadata values for '{hostname}'", hostName);

            // Article failed to download, display minimal basic meta data
            SetBasicArticleMetaData(exportFeedItem, item, hostName);
            exportFeedItem.ArticleText = ApplyTemplateToDescription(exportFeedItem, feed, ExportTemplates.BasicTemplate);
        }
        else
        {
            Log.Debug("Applying extended metadata values for '{hostname}'", hostName);

            SetExtendedArticleMetaData(exportFeedItem, item, hostName);
            exportFeedItem.ArticleText = ApplyTemplateToDescription(exportFeedItem, feed, ExportTemplates.ExtendedTemplate);
        }

        return exportFeedItem;
    }
示例#4
0
 /// <summary>
 /// Fills in the minimal meta data used when no article content is available:
 /// the host name doubles as the site name and the article text is a fixed notice.
 /// </summary>
 /// <param name="exportFeedItem">Item whose HostName, SiteName and ArticleText are set.</param>
 /// <param name="item">Crawled item; not read by this implementation.</param>
 /// <param name="hostName">Host name stored as both the host name and the site name.</param>
 protected virtual void SetBasicArticleMetaData(ExportFeedItem exportFeedItem, RssFeedItem item, string hostName)
 {
     const string unableToCrawlNotice =
         "<p>Unable to crawl article content. Click the link below to view in your browser.</p>";

     exportFeedItem.HostName    = hostName;
     exportFeedItem.SiteName    = hostName;
     exportFeedItem.ArticleText = unableToCrawlNotice;
 }
示例#5
0
 /// <summary>
 /// Fills in the meta data for an item that is itself a graphic: the feed URL becomes
 /// the image URL and the item's host name is used as both host name and site name.
 /// </summary>
 /// <param name="item">Crawled item supplying the feed URL and host name.</param>
 /// <param name="exportFeedItem">Item whose ImageUrl, HostName and SiteName are set.</param>
 protected virtual void SetGraphicMetaData(RssFeedItem item, ExportFeedItem exportFeedItem)
 {
     exportFeedItem.ImageUrl = item.FeedAttributes.Url;

     // The same host value serves as both the host name and the site name
     var host = item.HostName;
     exportFeedItem.HostName = host;
     exportFeedItem.SiteName = host;
 }
示例#6
0
    /// <summary>
    /// Populates the video-related fields of <paramref name="exportFeedItem"/> (VideoUrl,
    /// VideoHeight, VideoWidth) together with the subtitle, image, site name, host name and
    /// article text. Rumble and bitchute items are read from the parser result; all other
    /// sites are read from the Open Graph video tags.
    /// </summary>
    /// <param name="exportFeedItem">Item being populated.</param>
    /// <param name="item">Crawled item supplying the Open Graph and HTML attributes.</param>
    /// <param name="hostName">Host name stored on the item and used as the site name when none is reported.</param>
    protected virtual void SetVideoMetaData(ExportFeedItem exportFeedItem, RssFeedItem item, string hostName)
    {
        // Extract the meta data from the Open Graph tags
        exportFeedItem.Subtitle = item.OpenGraphAttributes.GetValueOrDefault("og:title") ?? "";
        exportFeedItem.ImageUrl = item.OpenGraphAttributes.GetValueOrDefault("og:image") ?? "";
        exportFeedItem.SiteName = item.OpenGraphAttributes.GetValueOrDefault("og:site_name")?.ToLowerInvariant() ?? "";
        exportFeedItem.HostName = hostName;
        var description = item.OpenGraphAttributes.GetValueOrDefault("og:description") ?? "";

        if (string.IsNullOrWhiteSpace(exportFeedItem.SiteName))
        {
            exportFeedItem.SiteName = hostName;
        }

        if (item.SiteName == "rumble")
        {
            var text = item.HtmlAttributes.GetValueOrDefault("ParserResult") ?? "";
            if (!text.StartsWith("<", StringComparison.Ordinal))
            {
                Log.Debug("EXPORT: Processing rumble.com ld+json metadata");

                // application/ld+json parser result. Deserialization yields null for an
                // empty parser result, so guard before iterating.
                var list = JsonConvert.DeserializeObject <List <JsonLdRumbleValues> >(text);
                if (list != null)
                {
                    // Use the first entry that carries an embed URL
                    foreach (var value in list)
                    {
                        if (string.IsNullOrWhiteSpace(value.embedUrl))
                        {
                            continue;
                        }

                        exportFeedItem.VideoUrl    = value.embedUrl;
                        exportFeedItem.VideoHeight = int.TryParse(Convert.ToString(value.height), out int height) ? height : 0;
                        exportFeedItem.VideoWidth  = int.TryParse(Convert.ToString(value.width), out int width) ? width : 0;
                        break;
                    }
                }
            }
        }
        else if (item.SiteName == "bitchute")
        {
            Log.Information("EXPORT: Processing bitchute.com metadata");

            // Bitchute logic is a little convoluted, they don't provide much metadata:
            // the video URL is taken as the text from the first 'https://' up to the next double quote.
            var result = item.HtmlAttributes.GetValueOrDefault("ParserResult") ?? "";
            var start  = result.IndexOf("https://", StringComparison.Ordinal);
            var end    = start >= 0 ? result.IndexOf('\"', start) : -1;

            if (end > start)
            {
                exportFeedItem.VideoUrl    = result.Substring(start, end - start);
                exportFeedItem.VideoHeight = 1080;
                exportFeedItem.VideoWidth  = 1920;
            }
            else
            {
                // Either no URL or no closing quote was found; leave the video fields unset
                // instead of failing with an out-of-range exception.
                Log.Warning("EXPORT: Unable to locate a video URL in the bitchute.com parser result");
            }
        }
        else
        {
            Log.Debug("EXPORT: Processing open graph video metadata");

            // Sites that provide video metadata via open graph tags
            exportFeedItem.VideoUrl = item.OpenGraphAttributes.GetValueOrDefault("og:video:secure_url") ??
                                      item.OpenGraphAttributes.GetValueOrDefault("og:video:url") ??
                                      item.OpenGraphAttributes.GetValueOrDefault("og:video") ??
                                      item.OpenGraphAttributes.GetValueOrDefault("og:x:video") ??
                                      "";

            // Dimensions fall back to the image dimensions when no video dimensions are given
            exportFeedItem.VideoHeight = int.TryParse(item.OpenGraphAttributes.GetValueOrDefault("og:video:height") ??
                                                      item.OpenGraphAttributes.GetValueOrDefault("og:x:video:height") ??
                                                      item.OpenGraphAttributes.GetValueOrDefault("og:image:height"), out int height) ? height : 0;
            exportFeedItem.VideoWidth = int.TryParse(item.OpenGraphAttributes.GetValueOrDefault("og:video:width") ??
                                                     item.OpenGraphAttributes.GetValueOrDefault("og:x:video:width") ??
                                                     item.OpenGraphAttributes.GetValueOrDefault("og:image:width"), out int width) ? width : 0;

            // A non-empty parser result takes precedence over the Open Graph description
            string result = item.HtmlAttributes.GetValueOrDefault("ParserResult") ?? "";
            if (!string.IsNullOrEmpty(result))
            {
                description = result;
            }
        }

        using (LogContext.PushProperty("hostName", hostName))
        {
            Log.Information("Video URL: '{url}' ({height}x{width})", exportFeedItem.VideoUrl, exportFeedItem.VideoHeight, exportFeedItem.VideoWidth);
        }

        // There's no article text for most video sites, so just use the meta description.
        // NOTE(review): description is inserted without HTML encoding; the parser result is presumably
        // already markup, but a plain og:description may need encoding - confirm.
        exportFeedItem.ArticleText = $"<p>{description}</p>";
    }