/// <summary>
/// Prepares downloaded HTML for import.
/// </summary>
/// <remarks>
/// Without a site configuration the HTML is either returned untouched (default HTML filter disabled,
/// no URL available, or the article is not readable) or replaced by the article title and content
/// extracted by SmartReader. With a configuration, the configured filters are applied, the result is
/// loaded into an <see cref="HtmlDocument"/>, <c>EnsureAbsoluteLinks</c> is run against
/// <paramref name="url"/>, and iframes are inlined when <c>cfg.LoadIframes</c> is set.
/// </remarks>
/// <param name="cfg">Site configuration matched for the content, or <c>null</c> when none applies.</param>
/// <param name="html">Raw HTML to process.</param>
/// <param name="url">Address the HTML came from; used as base URL. May be null or blank only when <paramref name="cfg"/> is <c>null</c>.</param>
/// <returns>The processed HTML.</returns>
public static async Task<string> ProcessContent(this WebsiteCfg cfg, string html, string url)
{
    if (cfg == null)
    {
        if (Svc<ImportPlugin>.Plugin.ImportConfig.UseDefaultHtmlFilter == false)
        {
            return html;
        }

        // Content that was not downloaded (e.g. a feed item carrying only inline content) has no
        // address. There is nothing meaningful to hand to the reader in that case, so keep the
        // HTML as it is instead of failing on a null/blank URL.
        if (string.IsNullOrWhiteSpace(url))
        {
            return html;
        }

        var article = SmartReader.Reader.ParseArticle(url, html, null);

        if (article.IsReadable == false)
        {
            return html;
        }

        return $"<h2>{article.Title}</h2><br/>\n" + article.Content;
    }

    html = cfg.ApplyFilter(html);

    // Throws for a null or malformed url; callers are expected to handle that.
    var baseUrl = new Uri(url);
    var htmlDoc = new HtmlDocument();

    htmlDoc.LoadHtml(html);
    htmlDoc.EnsureAbsoluteLinks(baseUrl);

    if (cfg.LoadIframes)
    {
        await htmlDoc.InlineIFrames(baseUrl, iframeUrl => iframeUrl.SafeGetStringAsync()).ConfigureAwait(false);
    }

    return htmlDoc.ToHtml();
}
/// <summary>
/// Sets the parent and the priority of <paramref name="builder"/> from a site configuration,
/// using the supplied fallbacks for whatever the configuration does not provide.
/// </summary>
/// <param name="builder">Builder to configure.</param>
/// <param name="cfg">Site configuration; may be <c>null</c>, in which case only the fallbacks are used.</param>
/// <param name="fallbackRoot">Parent used when <c>cfg?.RootElement</c> is <c>null</c>.</param>
/// <param name="fallbackPriority">Priority used when <c>cfg?.Priority</c> is <c>null</c>.</param>
/// <returns>The result of the chained <c>WithParent</c> / <c>WithPriority</c> calls.</returns>
public static ElementBuilder ConfigureWeb(
    this ElementBuilder builder,
    WebsiteCfg cfg,
    IElement fallbackRoot = null,
    int fallbackPriority = ImportConst.DefaultPriority)
{
    var parent = cfg?.RootElement ?? fallbackRoot;
    var withParent = builder.WithParent(parent);

    var priority = cfg?.Priority ?? fallbackPriority;

    return withParent.WithPriority(priority);
}
/// <summary>
/// Fills the date, title and source of <paramref name="r"/> using the site configuration.
/// </summary>
/// <param name="r">References object to configure.</param>
/// <param name="cfg">Site configuration; may be <c>null</c>, in which case the fallbacks are used and the source is <c>null</c>.</param>
/// <param name="html">HTML handed to the configuration's date and title parsers.</param>
/// <param name="fallbackDate">Date used when the configuration yields none.</param>
/// <param name="fallbackTitle">Title used when the configuration yields none.</param>
/// <returns>The result of the chained <c>WithDate</c> / <c>WithTitle</c> / <c>WithSource</c> calls.</returns>
public static References ConfigureWeb(
    this References r,
    WebsiteCfg cfg,
    string html,
    string fallbackDate = null,
    string fallbackTitle = null)
{
    var date = cfg?.ParseDateString(html) ?? fallbackDate;
    var dated = r.WithDate(date);

    var title = cfg?.ParseTitle(html) ?? fallbackTitle;
    var titled = dated.WithTitle(title);

    return titled.WithSource(cfg?.Name);
}
/// <summary>
/// Runs every filter of the site configuration over <paramref name="html"/> and joins the
/// non-blank results with <paramref name="separator"/>.
/// </summary>
/// <param name="cfg">Site configuration; when it is <c>null</c> or has no filters the HTML is returned unchanged.</param>
/// <param name="html">HTML passed to each filter.</param>
/// <param name="separator">Text inserted between the individual filter results.</param>
/// <returns>The joined filter output, or <paramref name="html"/> itself when there is nothing to apply.</returns>
public static string ApplyFilter(this WebsiteCfg cfg, string html, string separator = "\r\n")
{
    var nothingToApply = cfg?.Filters.None() ?? true;

    if (nothingToApply)
    {
        return html;
    }

    // Evaluated lazily by string.Join, in filter order.
    var fragments =
        from filter in cfg.Filters
        let fragment = filter.Filter(html)
        where !string.IsNullOrWhiteSpace(fragment)
        select fragment;

    return string.Join(separator, fragments);
}
/// <summary>
/// Creates an extended feed item by copying the values of <paramref name="feedItem"/> and
/// remembering the website configuration that goes with it.
/// </summary>
/// <param name="feedItem">Item whose values are copied into the new instance.</param>
/// <param name="webCfg">Website configuration stored in <c>WebCfg</c>; may be <c>null</c>.</param>
public FeedItemExt(FeedItem feedItem, WebsiteCfg webCfg)
{
    WebCfg = webCfg;

    // Identity
    Id    = feedItem.Id;
    Link  = feedItem.Link;
    Title = feedItem.Title;

    // Authorship and classification
    Author     = feedItem.Author;
    Categories = feedItem.Categories;

    // Body
    Description = feedItem.Description;
    Content     = feedItem.Content;

    // Publication date, parsed and raw
    PublishingDate       = feedItem.PublishingDate;
    PublishingDateString = feedItem.PublishingDateString;

    // Underlying feed-specific item
    SpecificItem = feedItem.SpecificItem;
}
/// <summary>
/// Creates a request for <paramref name="url"/> using the user agent of the site configuration
/// and, when the configuration defines a cookie string, attaches the parsed cookies.
/// </summary>
/// <param name="cfg">Site configuration supplying <c>UserAgent</c> and <c>Cookie</c>; must not be <c>null</c>.</param>
/// <param name="url">Address to request.</param>
/// <param name="client">Optional client handed to the underlying request factory.</param>
/// <returns>The request. A failure while parsing or attaching cookies is logged and the request is returned anyway.</returns>
public static IFlurlRequest CreateRequest(this WebsiteCfg cfg, string url, IFlurlClient client = null)
{
    var req = url.CreateRequest(client, cfg.UserAgent);

    try
    {
        if (string.IsNullOrWhiteSpace(cfg.Cookie) == false)
        {
            req.WithCookies(CookiesUtils.ParseCookies(cfg.Cookie, true));
        }
    }
    catch (Exception ex)
    {
        // Cookies are best-effort: log and carry on with the request.
        // The cookie is passed as a template argument rather than interpolated, so braces in its
        // value cannot be mistaken for template placeholders.
        LogTo.Error(ex, "Exception while parsing cookies: {Cookie}", cfg.Cookie);
    }

    return req;
}
/// <summary>
/// Prepares downloaded HTML for import using the site configuration.
/// </summary>
/// <remarks>
/// The configured filters are applied, the result is loaded into an <see cref="HtmlDocument"/>,
/// <c>EnsureAbsoluteLinks</c> is run against <paramref name="url"/>, and iframes are inlined when
/// <c>cfg.LoadIframes</c> is set.
/// </remarks>
/// <param name="cfg">Site configuration; when <c>null</c> the HTML is returned unchanged.</param>
/// <param name="html">Raw HTML to process.</param>
/// <param name="url">Address the HTML came from; used as base URL when <paramref name="cfg"/> is not <c>null</c>.</param>
/// <returns>The processed HTML.</returns>
public static async Task<string> ProcessContent(this WebsiteCfg cfg, string html, string url)
{
    if (cfg == null)
    {
        return html;
    }

    html = cfg.ApplyFilter(html);

    // Throws for a null or malformed url; callers are expected to handle that.
    var baseUrl = new Uri(url);
    var htmlDoc = new HtmlDocument();

    htmlDoc.LoadHtml(html);
    htmlDoc.EnsureAbsoluteLinks(baseUrl);

    if (cfg.LoadIframes)
    {
        // Nothing after the await needs the caller's synchronization context.
        await htmlDoc.InlineIFrames(baseUrl, iframeUrl => iframeUrl.SafeGetStringAsync()).ConfigureAwait(false);
    }

    return htmlDoc.ToHtml();
}
/// <summary>
/// Decides whether a feed item should be imported and, if so, obtains and processes its content.
/// </summary>
/// <param name="feedCfg">Feed settings used for the publishing-date, guid and category checks and for log messages.</param>
/// <param name="feedItem">Item to evaluate; its <c>Link</c> and <c>Content</c> are updated in place.</param>
/// <returns>
/// A <see cref="FeedItemExt"/> wrapping the item and the matched site configuration, or <c>null</c>
/// when the item is skipped, has no usable content, or an exception was caught and logged.
/// </returns>
private async Task<FeedItemExt> DownloadFeedContentAsync(FeedCfg feedCfg, FeedItem feedItem)
{
    WebsiteCfg webCfg = null;

    try
    {
        if (feedItem.Link != null)
        {
            webCfg = WebsitesConfig.FindConfig(feedItem.Link);
            feedItem.Link = feedItem.MakeLink(webCfg);
        }

        //
        // Check & update publishing dates

        if (feedCfg.UsePubDate)
        {
            if (feedItem.PublishingDate == null)
            {
                LogTo.Warning("Date missing, or unknown format for feed {Name}, item title '{Title}', raw date '{PublishingDateString}'",
                              feedCfg.Name, feedItem.Title, feedItem.PublishingDateString);
                return null;
            }

            if (feedItem.PublishingDate <= feedCfg.LastPubDate)
            {
                return null;
            }
        }

        //
        // Check guid

        if (feedCfg.UseGuid && feedCfg.EntriesGuid.Contains(feedItem.Id))
        {
            return null;
        }

        //
        // Check categories

        if (feedCfg.ShouldExcludeCategory(feedItem))
        {
            return null;
        }

        //
        // Download content or use inline content

        if (feedItem.Link != null)
        {
            // A dedicated client is created only when the site configuration defines a cookie.
            // It is disposed as soon as the response body has been read, so it is not leaked
            // once per feed item.
            using (var cookieClient = string.IsNullOrWhiteSpace(webCfg?.Cookie) ? null : new FlurlClient())
            {
                var httpReq = webCfg?.CreateRequest(feedItem.Link, cookieClient)
                              ?? feedItem.Link.CreateRequest();

                var httpResp = await httpReq.GetStringAsync().ConfigureAwait(false);

                if (httpResp != null)
                {
                    feedItem.Content = httpResp;
                }
                else
                {
                    feedItem.Content = null;
                    LogTo.Warning("Failed to download content for feed {Name}, item title '{Title}', link '{Link}'.",
                                  feedCfg.Name, feedItem.Title, feedItem.Link);
                }
            }
        }
        else
        {
            feedItem.Content ??= feedItem.Description;
        }

        if (string.IsNullOrWhiteSpace(feedItem.Content))
        {
            return null;
        }

        //
        // Process content if necessary & push back

        feedItem.Content = await webCfg.ProcessContent(feedItem.Content, feedItem.Link).ConfigureAwait(false);

        // Add feed item
        return new FeedItemExt(feedItem, webCfg);
    }
    catch (UriFormatException ex)
    {
        LogTo.Warning(ex, "Invalid content URI in feed {Name}, item title '{Title}', link '{Link}'.",
                      feedCfg.Name, feedItem.Title, feedItem.Link);
    }
    catch (FlurlHttpException ex)
    {
        LogTo.Warning(ex, "Failed to download content for feed {Name}, item title '{Title}', link '{Link}'.",
                      feedCfg.Name, feedItem.Title, feedItem.Link);
    }
    catch (Exception ex)
    {
        LogTo.Error(ex, "Exception while downloading content for feed {Name}, item title '{Title}', link '{Link}'",
                    feedCfg.Name, feedItem.Title, feedItem.Link);
    }

    return null;
}
/// <summary>
/// Extracts a date string from <paramref name="html"/> using the date regex of the site configuration.
/// </summary>
/// <param name="cfg">Site configuration; may be <c>null</c>.</param>
/// <param name="html">Text the regex is matched against.</param>
/// <returns>
/// <c>null</c> when there is no configuration or no date regex; otherwise whatever <c>SafeGet(1)</c>
/// yields for the match groups.
/// </returns>
public static string ParseDateString(this WebsiteCfg cfg, string html)
{
    var dateRegex = cfg?.DateRegex;

    if (dateRegex == null)
    {
        return null;
    }

    var groups = dateRegex.Match(html).Groups;

    return groups.SafeGet(1);
}
/// <summary>
/// Builds the link of a feed item by appending the link parameter of the site configuration, if any.
/// </summary>
/// <param name="feedItem">Item whose <c>Link</c> is used as the base.</param>
/// <param name="webCfg">Site configuration; may be <c>null</c>, in which case nothing is appended.</param>
/// <returns>The item's link followed by <c>webCfg.LinkParameter</c>, or by an empty string when that is unavailable.</returns>
public static string MakeLink(this FeedItem feedItem, WebsiteCfg webCfg)
{
    var suffix = webCfg?.LinkParameter ?? string.Empty;

    return string.Concat(feedItem.Link, suffix);
}