示例#1
0
        /// <summary>
        /// Prepares downloaded HTML for import. With a website configuration, the configured
        /// filters are applied, <c>EnsureAbsoluteLinks</c> is run against <paramref name="url"/>
        /// and, when <c>LoadIframes</c> is set, iframes are inlined. Without one, the HTML is
        /// either returned as-is or replaced by the article SmartReader extracts from it.
        /// </summary>
        /// <param name="cfg">The website configuration, or <c>null</c> when none applies.</param>
        /// <param name="html">The HTML to process.</param>
        /// <param name="url">The address the HTML was obtained from.</param>
        /// <returns>The processed HTML.</returns>
        public static async Task<string> ProcessContent(this WebsiteCfg cfg, string html, string url)
        {
            if (cfg != null)
            {
                var filteredHtml = cfg.ApplyFilter(html);
                var pageUri      = new Uri(url);
                var document     = new HtmlDocument();

                document.LoadHtml(filteredHtml);
                document.EnsureAbsoluteLinks(pageUri);

                if (cfg.LoadIframes)
                {
                    await document.InlineIFrames(pageUri, frameUrl => frameUrl.SafeGetStringAsync())
                                  .ConfigureAwait(false);
                }

                return document.ToHtml();
            }

            // No site-specific configuration: the default filter is opt-in through the plugin settings.
            if (Svc<ImportPlugin>.Plugin.ImportConfig.UseDefaultHtmlFilter == false)
            {
                return html;
            }

            var parsed = SmartReader.Reader.ParseArticle(url, html, null);

            // Keep the raw HTML when SmartReader reports the page as not readable.
            return parsed.IsReadable == false
                ? html
                : $"<h2>{parsed.Title}</h2><br/>\n" + parsed.Content;
        }
示例#2
0
 /// <summary>
 /// Sets the parent and priority of <paramref name="builder"/> from <paramref name="cfg"/>,
 /// using the fallback values when <paramref name="cfg"/> is <c>null</c> or leaves them unset.
 /// </summary>
 /// <param name="builder">The builder to configure.</param>
 /// <param name="cfg">The website configuration; may be <c>null</c>.</param>
 /// <param name="fallbackRoot">Parent used when the configuration provides no root element.</param>
 /// <param name="fallbackPriority">Priority used when the configuration provides none.</param>
 /// <returns>The result of the <c>WithParent</c> / <c>WithPriority</c> calls.</returns>
 public static ElementBuilder ConfigureWeb(
     this ElementBuilder builder,
     WebsiteCfg cfg,
     IElement fallbackRoot = null,
     int fallbackPriority  = ImportConst.DefaultPriority)
 {
     var parentElement = cfg?.RootElement ?? fallbackRoot;
     var withParent    = builder.WithParent(parentElement);

     var priority = cfg?.Priority ?? fallbackPriority;

     return withParent.WithPriority(priority);
 }
示例#3
0
 /// <summary>
 /// Fills the date, title and source of <paramref name="r"/> from values that
 /// <paramref name="cfg"/> parses out of <paramref name="html"/>, using the fallbacks when
 /// <paramref name="cfg"/> is <c>null</c> or parsing yields <c>null</c>.
 /// </summary>
 /// <param name="r">The references to configure.</param>
 /// <param name="cfg">The website configuration; may be <c>null</c>.</param>
 /// <param name="html">The HTML handed to the configuration's date and title parsers.</param>
 /// <param name="fallbackDate">Date used when none could be parsed.</param>
 /// <param name="fallbackTitle">Title used when none could be parsed.</param>
 /// <returns>The result of the <c>WithDate</c> / <c>WithTitle</c> / <c>WithSource</c> calls.</returns>
 public static References ConfigureWeb(
     this References r,
     WebsiteCfg cfg,
     string html,
     string fallbackDate  = null,
     string fallbackTitle = null)
 {
     var date     = cfg?.ParseDateString(html) ?? fallbackDate;
     var withDate = r.WithDate(date);

     var title     = cfg?.ParseTitle(html) ?? fallbackTitle;
     var withTitle = withDate.WithTitle(title);

     // The source is the configuration name; null when there is no configuration.
     return withTitle.WithSource(cfg?.Name);
 }
示例#4
0
        /// <summary>
        /// Runs every filter of <paramref name="cfg"/> over <paramref name="html"/> and joins the
        /// non-blank results with <paramref name="separator"/>.
        /// </summary>
        /// <param name="cfg">The website configuration; may be <c>null</c>.</param>
        /// <param name="html">The HTML to filter.</param>
        /// <param name="separator">Text placed between the individual filter results.</param>
        /// <returns>
        /// <paramref name="html"/> unchanged when there is no configuration or it has no filters;
        /// otherwise the joined filter output.
        /// </returns>
        public static string ApplyFilter(this WebsiteCfg cfg, string html, string separator = "\r\n")
        {
            if (cfg == null || cfg.Filters.None())
            {
                return html;
            }

            // Evaluated lazily by string.Join below; blank results are dropped.
            var fragments = from filter in cfg.Filters
                            let fragment = filter.Filter(html)
                            where !string.IsNullOrWhiteSpace(fragment)
                            select fragment;

            return string.Join(separator, fragments);
        }
示例#5
0
 /// <summary>
 /// Creates an extended feed item by copying the values of <paramref name="feedItem"/> and
 /// attaching the website configuration that applies to it.
 /// </summary>
 /// <param name="feedItem">The feed item whose values are copied.</param>
 /// <param name="webCfg">The website configuration stored in <c>WebCfg</c>.</param>
 public FeedItemExt(FeedItem feedItem, WebsiteCfg webCfg)
 {
     // Identity
     Id     = feedItem.Id;
     Link   = feedItem.Link;
     Title  = feedItem.Title;
     Author = feedItem.Author;

     // Body
     Description = feedItem.Description;
     Content     = feedItem.Content;
     Categories  = feedItem.Categories;

     // Publication date, parsed and raw
     PublishingDate       = feedItem.PublishingDate;
     PublishingDateString = feedItem.PublishingDateString;

     SpecificItem = feedItem.SpecificItem;
     WebCfg       = webCfg;
 }
示例#6
0
        /// <summary>
        /// Builds a request for <paramref name="url"/> using the user agent of
        /// <paramref name="cfg"/>, and attaches the configured cookies when there are any.
        /// </summary>
        /// <param name="cfg">The website configuration supplying the user agent and cookie string.</param>
        /// <param name="url">The address to request.</param>
        /// <param name="client">Optional client passed on to the underlying <c>CreateRequest</c> call.</param>
        /// <returns>
        /// The request. It carries no cookies when the cookie string is blank or could not be
        /// parsed; a parsing failure is logged, not thrown.
        /// </returns>
        public static IFlurlRequest CreateRequest(this WebsiteCfg cfg, string url, IFlurlClient client = null)
        {
            var req = url.CreateRequest(client, cfg.UserAgent);

            try
            {
                if (string.IsNullOrWhiteSpace(cfg.Cookie) == false)
                {
                    req.WithCookies(CookiesUtils.ParseCookies(cfg.Cookie, true));
                }
            }
            catch (Exception ex)
            {
                // Best effort: a malformed cookie string must not prevent the request itself.
                // The cookie is passed as a template argument rather than interpolated, so any
                // braces it contains are not interpreted as part of the message template.
                // NOTE(review): the raw cookie may hold session secrets — confirm it is acceptable in logs.
                LogTo.Error(ex, "Exception while parsing cookies: {Cookie}", cfg.Cookie);
            }

            return(req);
        }
        /// <summary>
        /// Applies the website configuration to downloaded HTML: runs the configured filters,
        /// calls <c>EnsureAbsoluteLinks</c> against <paramref name="url"/> and, when
        /// <c>LoadIframes</c> is set, inlines iframes.
        /// </summary>
        /// <param name="cfg">The website configuration, or <c>null</c> when none applies.</param>
        /// <param name="html">The HTML to process.</param>
        /// <param name="url">The address the HTML was obtained from.</param>
        /// <returns>
        /// <paramref name="html"/> unchanged when <paramref name="cfg"/> is <c>null</c>;
        /// otherwise the processed HTML.
        /// </returns>
        public static async Task <string> ProcessContent(this WebsiteCfg cfg, string html, string url)
        {
            if (cfg == null)
            {
                return(html);
            }

            html = cfg.ApplyFilter(html);

            var baseUrl = new Uri(url);
            var htmlDoc = new HtmlDocument();

            htmlDoc.LoadHtml(html);

            htmlDoc.EnsureAbsoluteLinks(baseUrl);

            if (cfg.LoadIframes)
            {
                // Nothing after this await needs the caller's synchronization context.
                await htmlDoc.InlineIFrames(baseUrl, iframeUrl => iframeUrl.SafeGetStringAsync()).ConfigureAwait(false);
            }

            return(htmlDoc.ToHtml());
        }
示例#8
0
        /// <summary>
        /// Decides whether <paramref name="feedItem"/> should be imported and, if so, retrieves
        /// and processes its content.
        /// </summary>
        /// <param name="feedCfg">Settings of the feed the item comes from.</param>
        /// <param name="feedItem">
        /// The item to evaluate. Its <c>Link</c> and <c>Content</c> are updated in place.
        /// </param>
        /// <returns>
        /// The item together with its website configuration, or <c>null</c> when the item is
        /// skipped (publishing date, guid or category checks), has no usable content, or an
        /// exception was caught and logged.
        /// </returns>
        private async Task <FeedItemExt> DownloadFeedContentAsync(FeedCfg feedCfg,
                                                                  FeedItem feedItem)
        {
            WebsiteCfg webCfg = null;

            try
            {
                if (feedItem.Link != null)
                {
                    webCfg        = WebsitesConfig.FindConfig(feedItem.Link);
                    feedItem.Link = feedItem.MakeLink(webCfg);
                }

                //
                // Check & update publishing dates

                if (feedCfg.UsePubDate)
                {
                    if (feedItem.PublishingDate == null)
                    {
                        LogTo.Warning("Date missing, or unknown format for feed {Name}, item title '{Title}', raw date '{PublishingDateString}'",
                                      feedCfg.Name, feedItem.Title, feedItem.PublishingDateString);
                        return(null);
                    }

                    if (feedItem.PublishingDate <= feedCfg.LastPubDate)
                    {
                        return(null);
                    }
                }

                //
                // Check guid

                if (feedCfg.UseGuid && feedCfg.EntriesGuid.Contains(feedItem.Id))
                {
                    return(null);
                }

                //
                // Check categories

                if (feedCfg.ShouldExcludeCategory(feedItem))
                {
                    return(null);
                }

                //
                // Download content or use inline content

                if (feedItem.Link != null)
                {
                    // A dedicated client is created only when the website configuration carries
                    // cookies. It is owned by this call, so it is disposed once the response has
                    // been read (a null resource is a no-op for the using statement).
                    using (var cookieClient = string.IsNullOrWhiteSpace(webCfg?.Cookie) ? null : new FlurlClient())
                    {
                        var httpReq = webCfg?.CreateRequest(feedItem.Link, cookieClient)
                                      ?? feedItem.Link.CreateRequest();

                        var httpResp = await httpReq.GetStringAsync().ConfigureAwait(false);

                        if (httpResp != null)
                        {
                            feedItem.Content = httpResp;
                        }
                        else
                        {
                            feedItem.Content = null;
                            LogTo.Warning("Failed to download content for feed {Name}, item title '{Title}', link '{Link}'.", feedCfg.Name,
                                          feedItem.Title, feedItem.Link);
                        }
                    }
                }

                else
                {
                    // No link to download from: fall back to the description when there is no inline content.
                    feedItem.Content ??= feedItem.Description;
                }

                if (string.IsNullOrWhiteSpace(feedItem.Content))
                {
                    return(null);
                }

                //
                // Process content if necessary & push back

                feedItem.Content = await webCfg.ProcessContent(feedItem.Content, feedItem.Link).ConfigureAwait(false);

                // Add feed item
                return(new FeedItemExt(feedItem, webCfg));
            }
            catch (UriFormatException ex)
            {
                LogTo.Warning(ex, "Invalid content URI in feed {Name}, item title '{Title}', link '{Link}'.", feedCfg.Name,
                              feedItem.Title, feedItem.Link);
            }
            catch (FlurlHttpException ex)
            {
                LogTo.Warning(ex, "Failed to download content for feed {Name}, item title '{Title}', link '{Link}'.", feedCfg.Name,
                              feedItem.Title, feedItem.Link);
            }
            catch (Exception ex)
            {
                LogTo.Error(ex, "Exception while downloading content for feed {Name}, item title '{Title}', link '{Link}'", feedCfg.Name,
                            feedItem.Title, feedItem.Link);
            }

            // Reached only after an exception was logged above.
            return(null);
        }
示例#9
0
 /// <summary>
 /// Matches the date regex of <paramref name="cfg"/> against <paramref name="html"/> and
 /// returns capture group 1, read through <c>SafeGet</c>.
 /// </summary>
 /// <param name="cfg">The website configuration; may be <c>null</c>.</param>
 /// <param name="html">The HTML to search.</param>
 /// <returns>
 /// <c>null</c> when there is no configuration or no date regex; otherwise whatever
 /// <c>SafeGet(1)</c> yields for the match.
 /// </returns>
 public static string ParseDateString(this WebsiteCfg cfg, string html)
 {
     var dateRegex = cfg?.DateRegex;

     if (dateRegex == null)
     {
         return null;
     }

     var groups = dateRegex.Match(html).Groups;

     return groups.SafeGet(1);
 }
示例#10
0
 /// <summary>
 /// Returns the link of <paramref name="feedItem"/> with the link parameter of
 /// <paramref name="webCfg"/> appended.
 /// </summary>
 /// <param name="feedItem">The feed item whose link is used.</param>
 /// <param name="webCfg">The website configuration; may be <c>null</c>.</param>
 /// <returns>The link followed by the link parameter, or by nothing when there is none.</returns>
 public static string MakeLink(this FeedItem feedItem, WebsiteCfg webCfg)
 {
     var link   = feedItem.Link;
     var suffix = webCfg?.LinkParameter ?? string.Empty;

     return string.Concat(link, suffix);
 }