/// <summary>
/// Attempts to get metadata for the article.
/// </summary>
/// <remarks>
/// Values are collected from the document's <c>meta</c> elements (name, property and
/// itemprop attributes) into a dictionary keyed by the normalized attribute name, and the
/// individual metadata fields are then filled from the first non-empty candidate key.
/// The publication date falls back to <c>time</c> elements and finally to a
/// <c>/yyyy/MM/dd</c> segment in the URL; a URL segment that is not a valid calendar date
/// is ignored instead of throwing.
/// </remarks>
/// <param name="doc">The document</param>
/// <param name="uri">The uri, possibly used to check for a date</param>
/// <param name="language">The language that was possibly found in the headers of the response</param>
/// <returns>The metadata object with all the info found</returns>
internal static Metadata GetArticleMetadata(IHtmlDocument doc, Uri uri, string language)
{
    Metadata metadata = new Metadata();
    Dictionary<string, string> values = new Dictionary<string, string>();
    var metaElements = doc.GetElementsByTagName("meta");

    // Match "description", or Twitter's "twitter:description" (Cards)
    // in name attribute.
    // name is a single value
    var namePattern = @"^\s*((?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|image|site_name)|name)\s*$";

    // Match Facebook's Open Graph title & description properties.
    // property is a space-separated list of values
    var propertyPattern = @"\s*(dc|dcterm|og|twitter|article)\s*:\s*(author|creator|description|title|published_time|image|site_name)(\s+|$)";

    var itemPropPattern = @"\s*datePublished\s*";

    // Collect the interesting meta tags.
    NodeUtility.ForEachNode(metaElements, (element) =>
    {
        var meta = element as IElement;
        if (meta == null)
        {
            return;
        }

        var elementName = meta.GetAttribute("name") ?? "";
        var elementProperty = meta.GetAttribute("property") ?? "";
        var itemProp = meta.GetAttribute("itemprop") ?? "";
        var content = meta.GetAttribute("content");

        // A meta tag without content carries no information for us.
        if (String.IsNullOrEmpty(content))
        {
            return;
        }

        // A plain "author" in any of the three attributes wins immediately.
        if (elementName == "author" || elementProperty == "author" || itemProp == "author")
        {
            metadata.Byline = content;
            metadata.Author = content;
            return;
        }

        MatchCollection matches = null;
        String name = "";

        if (!String.IsNullOrEmpty(elementProperty))
        {
            matches = Regex.Matches(elementProperty, propertyPattern);
            for (int i = matches.Count - 1; i >= 0; i--)
            {
                // Convert to lowercase, and remove any whitespace
                // so we can match below. Invariant casing keeps the keys stable
                // regardless of the current culture (e.g. Turkish dotless i).
                name = Regex.Replace(matches[i].Value.ToLowerInvariant(), @"\s+", "");
                values[name] = content.Trim();
            }
        }

        if ((matches == null || matches.Count == 0)
            && !String.IsNullOrEmpty(elementName)
            && Regex.IsMatch(elementName, namePattern, RegexOptions.IgnoreCase))
        {
            // Convert to lowercase, remove any whitespace, and convert dots
            // to colons so we can match below.
            name = Regex.Replace(Regex.Replace(elementName.ToLowerInvariant(), @"\s+", ""), @"\.", ":");
            values[name] = content.Trim();
        }
        else if (Regex.IsMatch(elementProperty, propertyPattern, RegexOptions.IgnoreCase))
        {
            name = elementProperty;
        }
        else if (Regex.IsMatch(itemProp, itemPropPattern, RegexOptions.IgnoreCase))
        {
            name = itemProp;
        }

        if (!String.IsNullOrEmpty(name))
        {
            // Convert to lowercase and remove any whitespace so we can match below.
            // Only the first value seen for a key is kept here.
            name = Regex.Replace(name.ToLowerInvariant(), @"\s", "", RegexOptions.IgnoreCase);
            if (!values.ContainsKey(name))
            {
                values.Add(name, content.Trim());
            }
        }
    });

    // Returns the collected value for a key, or null when the key was not found.
    string Lookup(string key)
    {
        return values.TryGetValue(key, out string found) ? found : null;
    }

    // Returns the first non-empty collected value among the keys (in order), or "".
    string FirstNonEmpty(params string[] keys)
    {
        return keys.Select(key => Lookup(key)).FirstOrDefault(v => !String.IsNullOrEmpty(v)) ?? "";
    }

    // Find the description of the article
    metadata.Excerpt = FirstNonEmpty(
        "description",
        "dc:description",
        "dcterm:description",
        "og:description",
        "weibo:article:description",
        "weibo:webpage:description",
        "twitter:description");

    // Get the name of the site
    if (values.TryGetValue("og:site_name", out string siteName))
    {
        metadata.SiteName = siteName;
    }

    // Find the title of the article
    metadata.Title = FirstNonEmpty(
        "dc:title",
        "dcterm:title",
        "og:title",
        "weibo:article:title",
        "weibo:webpage:title",
        "twitter:title",
        "title");

    // Let's try to eliminate the site name from the title
    metadata.Title = Readability.CleanTitle(metadata.Title, metadata.SiteName);

    // We did not find any title,
    // we try to get it from the title tag
    if (String.IsNullOrEmpty(metadata.Title))
    {
        metadata.Title = Readability.GetArticleTitle(doc);
    }

    // Language extraction: evaluated lazily so the document is only queried
    // when the earlier candidates are empty.
    IEnumerable<string> LanguageHeuristics()
    {
        yield return language;
        yield return doc.GetElementsByTagName("html")[0].GetAttribute("lang");
        yield return doc.GetElementsByTagName("html")[0].GetAttribute("xml:lang");
        yield return doc.QuerySelector("meta[http-equiv=\"Content-Language\"]")?.GetAttribute("content");
        // this is wrong, but it's used
        yield return doc.QuerySelector("meta[name=\"lang\"]")?.GetAttribute("value");
    }

    metadata.Language = LanguageHeuristics().FirstOrDefault(l => !String.IsNullOrEmpty(l)) ?? "";

    // Find the featured image of the article
    metadata.FeaturedImage = FirstNonEmpty(
        "og:image",
        "twitter:image",
        "weibo:article:image",
        "weibo:webpage:image");

    if (String.IsNullOrEmpty(metadata.Author))
    {
        // We try to find a meta tag for the author.
        // Note that there is an Open Graph tag for an author,
        // but it usually contains a profile URL of the author.
        // So we do not use it
        metadata.Author = FirstNonEmpty("dc:creator", "dcterm:creator", "author");
    }

    // Date extraction: the first key whose value parses to something other than
    // DateTime.MinValue wins; null when none does.
    DateTime? ParsedDate(string key)
    {
        return values.TryGetValue(key, out string raw) && DateTime.TryParse(raw, out DateTime parsed)
            ? parsed
            : (DateTime?)null;
    }

    metadata.PublicationDate = new[]
        {
            "article:published_time",
            "date",
            "datepublished",
            "weibo:article:create_at",
            "weibo:webpage:create_at"
        }
        .Select(key => ParsedDate(key))
        .FirstOrDefault(d => d.HasValue && d.Value != DateTime.MinValue);

    if (metadata.PublicationDate == null)
    {
        var times = doc.GetElementsByTagName("time");

        foreach (var time in times)
        {
            // No early exit: when several <time pubDate> elements parse, the last one wins.
            if (!String.IsNullOrEmpty(time.GetAttribute("pubDate"))
                && DateTime.TryParse(time.GetAttribute("datetime"), out DateTime date))
            {
                metadata.PublicationDate = date;
            }
        }
    }

    if (metadata.PublicationDate == null)
    {
        // as a last resort check the URL for a date
        Match maybeDate = Regex.Match(uri.PathAndQuery, "/(?<year>[0-9]{4})/(?<month>[0-9]{2})/(?<day>[0-9]{2})?");
        if (maybeDate.Success)
        {
            int year = int.Parse(maybeDate.Groups["year"].Value);
            int month = int.Parse(maybeDate.Groups["month"].Value);
            int day = !String.IsNullOrEmpty(maybeDate.Groups["day"].Value)
                ? int.Parse(maybeDate.Groups["day"].Value)
                : 1;

            // The pattern only guarantees digits, not a real calendar date
            // (e.g. "/1234/56/78"); constructing a DateTime from such values would throw.
            bool isCalendarDate = year >= 1
                && month >= 1 && month <= 12
                && day >= 1 && day <= DateTime.DaysInMonth(year, month);

            if (isCalendarDate)
            {
                metadata.PublicationDate = new DateTime(year, month, day);
            }
        }
    }

    return metadata;
}
/// <summary>
/// Looks for feed URLs starting from <paramref name="originUrl"/>.
/// </summary>
/// <remarks>
/// Candidates are examined in stages: the URL itself, <c>link</c> elements whose
/// <c>type</c> is in <c>FeedMimeTypes</c>, anchors on the same host, anchors elsewhere,
/// and finally the names in <c>FeedNames</c> resolved against the base URL. Each stage's
/// list is passed through <c>FilterUrlsByFeed</c> (presumably pruning non-feeds in place —
/// verify against that helper) and sorted with <c>UrlFeedComparison</c>.
/// </remarks>
/// <param name="originUrl">The URL to start from; normalized with <c>FixupUrl</c>.</param>
/// <param name="findAll">
/// When false, the first stage that yields any URL is returned immediately;
/// when true, the results of all stages are accumulated and returned together.
/// </param>
/// <returns>The feed URLs found; empty when no stage produced any.</returns>
public static async Task<IList<Uri>> GetFeedUrls(string originUrl, bool findAll = false)
{
    var collected = new List<Uri>();
    Uri baseUri = FixupUrl(originUrl);

    // Stage 0: the origin itself may already be a feed.
    Log.FindFeedCheckingBase(baseUri);
    string body = await GetFeedData(baseUri);
    if (LooksLikeFeed(body))
    {
        Log.FindFeedBaseWasFeed(baseUri);
        return new[] { baseUri };
    }

    // Not a feed, so treat the response as HTML and inspect it.
    var htmlParser = new HtmlParser();
    IHtmlDocument page = htmlParser.ParseDocument(body);

    // Stage 1: <link> elements advertising a feed MIME type.
    Log.FindFeedCheckingLinkElements(baseUri);
    List<Uri> fromLinks = new List<Uri>();
    foreach (IElement link in page.GetElementsByTagName("link"))
    {
        string mimeType = link.GetAttribute("type");
        if (mimeType == null || !FeedMimeTypes.Contains(mimeType))
        {
            continue;
        }

        Uri target = SyndicationUtil.TryParseAbsoluteUrl(link.GetAttribute("href"), baseUri);
        if (target != null)
        {
            fromLinks.Add(target);
        }
    }

    await FilterUrlsByFeed(fromLinks);
    if (fromLinks.Count > 0)
    {
        Log.FindFeedFoundLinkElements(baseUri, fromLinks);
        fromLinks.Sort(UrlFeedComparison);
        collected.AddRange(fromLinks);
        if (!findAll)
        {
            return collected;
        }
    }

    // Stage 2: <a> elements, split into same-host and other-host guesses.
    Log.FindFeedCheckingAnchorElements(baseUri);
    List<Uri> sameHost = new List<Uri>();
    List<Uri> otherHost = new List<Uri>();
    foreach (IElement anchor in page.GetElementsByTagName("a"))
    {
        Uri target = SyndicationUtil.TryParseAbsoluteUrl(anchor.GetAttribute("href"), baseUri);
        if (target == null)
        {
            continue;
        }

        if ((target.Host == baseUri.Host) && IsFeedUrl(target))
        {
            sameHost.Add(target);
        }
        else if (IsFeedishUrl(target))
        {
            otherHost.Add(target);
        }
    }

    Log.FindFeedFoundSomeAnchors(baseUri, sameHost, otherHost);

    // Guesses on the same host are considered before the others.
    await FilterUrlsByFeed(sameHost);
    if (sameHost.Count > 0)
    {
        Log.FindFeedsFoundLocalGuesses(baseUri, sameHost);
        sameHost.Sort(UrlFeedComparison);
        collected.AddRange(sameHost);
        if (!findAll)
        {
            return sameHost;
        }
    }

    await FilterUrlsByFeed(otherHost);
    if (otherHost.Count > 0)
    {
        Log.FindFeedsFoundRemoteGuesses(baseUri, otherHost);
        otherHost.Sort(UrlFeedComparison);
        collected.AddRange(otherHost);
        if (!findAll)
        {
            return otherHost;
        }
    }

    // Stage 3: well-known feed names relative to the base URL.
    List<Uri> wellKnown = FeedNames.Select(s => new Uri(baseUri, s)).ToList();
    await FilterUrlsByFeed(wellKnown);
    if (wellKnown.Count > 0)
    {
        Log.FindFeedsFoundRandomGuesses(baseUri, wellKnown);
        wellKnown.Sort(UrlFeedComparison);
        collected.AddRange(wellKnown);
        if (!findAll)
        {
            return wellKnown;
        }
    }

    // Either nothing was found, or findAll accumulated every stage.
    Log.FindFeedFoundTotal(baseUri, collected);
    return collected;
}
/// <summary>
/// Get the article title
/// </summary>
/// <remarks>
/// Starts from the document title and tries to strip site-name style prefixes/suffixes:
/// a part delimited by a spaced separator (<c> | </c>, <c> - </c>, <c> \ </c>, <c> / </c>,
/// <c> &gt; </c>, <c> » </c>), or the text before a colon when no h1/h2 heading equals the
/// whole title. Very short or very long titles are replaced by the text of the only
/// <c>h1</c>, if there is exactly one. If the result ends up too short, the original title
/// is returned.
/// </remarks>
/// <param name="doc">The document</param>
/// <returns>
/// The clean title
/// </returns>
internal static string GetArticleTitle(IHtmlDocument doc)
{
    var curTitle = "";
    var origTitle = "";

    try
    {
        curTitle = origTitle = doc.Title.Trim();
    }
    catch (Exception)
    {
        // Best effort: a document whose title cannot be read is treated as untitled.
    }

    var titleHadHierarchicalSeparators = false;

    int wordCount(String str)
    {
        return Regex.Split(str, @"\s+").Length;
    }

    // If there's a separator in the title, first remove the final part.
    // The separator must be surrounded by spaces, exactly like the replace patterns
    // below require; otherwise a hyphenated word or a slash inside a word would
    // enter this branch, change nothing, and block the colon/h1 fallbacks.
    if (Regex.IsMatch(curTitle, @" [\|\-\\\/>»] "))
    {
        titleHadHierarchicalSeparators = Regex.IsMatch(curTitle, @" [\\\/>»] ");
        curTitle = Regex.Replace(origTitle, @"(.*) [\|\-\\\/>»] .*", "$1");

        // If the resulting title is too short (fewer than 3 words), remove
        // the first part instead:
        if (wordCount(curTitle) < 3)
        {
            curTitle = Regex.Replace(origTitle, @"[^\|\-\\\/>»]* [\|\-\\\/>»](.*)", "$1");
        }
    }
    else if (curTitle.IndexOf(": ") != -1)
    {
        // Check if we have a heading containing this exact string, so we
        // could assume it's the full title.
        var headings = NodeUtility.ConcatNodeLists(
            doc.GetElementsByTagName("h1"),
            doc.GetElementsByTagName("h2")
        );
        var trimmedTitle = curTitle.Trim();
        var match = NodeUtility.SomeNode(headings, (heading) =>
        {
            return heading.TextContent.Trim() == trimmedTitle;
        });

        // If we don't, let's extract the title out of the original title string.
        if (!match)
        {
            curTitle = origTitle.Substring(origTitle.LastIndexOf(':') + 1);

            // If the title is now too short, try the first colon instead:
            if (wordCount(curTitle) < 3)
            {
                curTitle = origTitle.Substring(origTitle.IndexOf(':') + 1);
            }
        }
    }
    else if (curTitle.Length > 150 || curTitle.Length < 15)
    {
        var hOnes = doc.GetElementsByTagName("h1");

        if (hOnes.Length == 1)
        {
            curTitle = NodeUtility.GetInnerText(hOnes[0]);
        }
    }

    curTitle = curTitle.Trim();

    // If we now have 4 words or fewer as our title, and either no
    // 'hierarchical' separators (\, /, > or ») were found in the original
    // title or we decreased the number of words by more than 1 word, use
    // the original title.
    var curTitleWordCount = wordCount(curTitle);
    if (curTitleWordCount <= 4 && (
        !titleHadHierarchicalSeparators ||
        curTitleWordCount != wordCount(Regex.Replace(origTitle, @"[\|\-\\\/>»: ]+", " ")) - 1))
    {
        curTitle = origTitle;
    }

    return curTitle;
}
/// <summary>
/// Collects the local assets referenced by an HTML document.
/// </summary>
/// <remarks>
/// Assets are gathered in this order: those returned by <c>GetCssAssets</c> for every
/// inline <c>style</c> element, then image sources, then stylesheet links, then script
/// sources. A reference is added only when <c>IsLocalPath</c> accepts it and no asset with
/// the same URI has been collected yet. Each new asset gets a fresh GUID (without dashes)
/// as its <c>NewUri</c>.
/// </remarks>
/// <param name="doc">The parsed document to scan.</param>
/// <returns>The assets found, in discovery order.</returns>
public List<Asset> GetHtmlAssets(IHtmlDocument doc)
{
    var assets = new List<Asset>();

    var images = doc.Images
        .Where(x => x.HasAttribute("src"));

    // GetAttribute returns null for a <link> without rel, so such links are skipped
    // instead of crashing on a null attribute. The comparison is culture-independent.
    var styles = doc.GetElementsByTagName("link")
        .Where(l => string.Equals(l.GetAttribute("rel")?.Trim(), "stylesheet", StringComparison.OrdinalIgnoreCase))
        .Where(c => c.HasAttribute("href"));

    var scripts = doc.GetElementsByTagName("script")
        .Where(x => x.HasAttribute("src"));

    var inlineStyles = doc.GetElementsByTagName("style");

    foreach (var inlineStyle in inlineStyles)
    {
        assets.AddRange(GetCssAssets(inlineStyle.InnerHtml));
    }

    foreach (var image in images)
    {
        var src = image.GetAttribute("src");
        if (IsLocalPath(src) && !assets.Any(a => a.Uri == src))
        {
            // Text after the last '.', with any query string or fragment removed.
            var suffix = src.Split('.').Last().Split('?').First().Split('#').First();
            assets.Add(new Asset
            {
                Uri = src,
                Suffix = suffix,
                NewUri = Guid.NewGuid().ToString().Replace("-", "")
            });
        }
    }

    foreach (var css in styles)
    {
        var src = css.GetAttribute("href");
        if (IsLocalPath(src) && !assets.Any(a => a.Uri == src))
        {
            assets.Add(new Asset
            {
                Uri = src,
                Suffix = "css",
                NewUri = Guid.NewGuid().ToString().Replace("-", "")
            });
        }
    }

    foreach (var script in scripts)
    {
        var src = script.GetAttribute("src");
        if (IsLocalPath(src) && !assets.Any(a => a.Uri == src))
        {
            var suffix = src.Split('.').Last().Split('?').First().Split('#').First();

            // No '.' in the path part: the "suffix" is the whole path, so assume JavaScript.
            if (suffix == src.Split('?').First())
            {
                suffix = "js";
            }

            assets.Add(new Asset
            {
                Uri = src,
                Suffix = suffix,
                NewUri = Guid.NewGuid().ToString().Replace("-", "")
            });
        }
    }

    return assets;
}
/// <summary>
/// Rewrites the <c>href</c> of <c>link</c> elements and the source of <c>script</c>
/// elements in every input document with whatever <c>DownloadAndReplaceAsync</c> returns.
/// </summary>
/// <remarks>
/// Elements carrying a <c>data-no-mirror</c> attribute are left alone. A document is cloned
/// with new HTML content only when at least one reference was replaced; otherwise the
/// input document is returned unchanged. All documents share one cache dictionary.
/// </remarks>
/// <param name="context">The execution context supplying inputs and content streams.</param>
/// <returns>One output document per input document.</returns>
protected override async Task<IEnumerable<Common.IDocument>> ExecuteContextAsync(IExecutionContext context)
{
#pragma warning disable RCS1163 // Unused parameter.
    // Handle invalid HTTPS certificates and allow alternate security protocols (see http://stackoverflow.com/a/5670954/807064)
    // NOTE(review): this accepts every server certificate, and the callback is a static,
    // process-wide setting rather than something scoped to this method.
    ServicePointManager.ServerCertificateValidationCallback = (s, cert, chain, ssl) => true;
#pragma warning restore RCS1163 // Unused parameter.

    // Cache of downloaded resources, shared by all input documents.
    Dictionary<string, string> mirrorCache = new Dictionary<string, string>();

    // Inputs are processed one after another so the same resource is not downloaded twice.
    HtmlParser parser = new HtmlParser();
    return await context.Inputs
        .ToAsyncEnumerable()
        .SelectAwait(async x => await GetDocumentAsync(x))
        .ToListAsync();

    async Task<Common.IDocument> GetDocumentAsync(Common.IDocument input)
    {
        IHtmlDocument html = await input.ParseHtmlAsync(context, parser);
        if (html == null)
        {
            return input;
        }

        bool changed = false;

        // <link href="...">
        var mirrorableLinks = html
            .GetElementsByTagName("link")
            .Where(x => x.HasAttribute("href") && !x.HasAttribute("data-no-mirror"));
        foreach (IElement link in mirrorableLinks)
        {
            string mirrored = await DownloadAndReplaceAsync(link.GetAttribute("href"), mirrorCache, context);
            if (mirrored == null)
            {
                continue;
            }

            link.Attributes["href"].Value = mirrored;
            changed = true;
        }

        // <script src="...">
        var mirrorableScripts = html.Scripts
            .Where(x => !string.IsNullOrEmpty(x.Source) && !x.HasAttribute("data-no-mirror"));
        foreach (IHtmlScriptElement script in mirrorableScripts)
        {
            string mirrored = await DownloadAndReplaceAsync(script.Source, mirrorCache, context);
            if (mirrored == null)
            {
                continue;
            }

            script.Source = mirrored;
            changed = true;
        }

        if (!changed)
        {
            return input;
        }

        // Serialize the modified DOM and clone the input document around the new content.
        using (Stream contentStream = await context.GetContentStreamAsync())
        {
            using (StreamWriter writer = contentStream.GetWriter())
            {
                html.ToHtml(writer, ProcessingInstructionFormatter.Instance);
                writer.Flush();
                return input.Clone(context.GetContentProvider(contentStream, MediaTypes.Html));
            }
        }
    }
}
/// <summary>
/// Extracts a name from the text of the first <c>h1</c> element of the document.
/// </summary>
/// <param name="doc">The parsed document.</param>
/// <returns>
/// The heading text with anything that looks like a tag (<c>&lt;...&gt;</c>) and all
/// line feeds removed, trimmed; an empty string when the document has no <c>h1</c>.
/// </returns>
public static string ParseName(IHtmlDocument doc)
{
    var heading = doc.GetElementsByTagName("h1").FirstOrDefault();
    if (heading == null)
    {
        // FirstOrDefault yields null on pages without a heading; dereferencing it
        // would surface as a NullReferenceException.
        return string.Empty;
    }

    return Regex.Replace(heading.TextContent, @"<[^>]*>", "").Replace("\n", "").Trim();
}
/// <summary>
/// Find all &lt;noscript&gt; that are located after &lt;img&gt; nodes, and which contain
/// only one single &lt;img&gt; element. Replace the preceding image with the image from
/// inside the &lt;noscript&gt; tag. This improves the quality of the
/// images we use on some sites (e.g. Medium)
/// </summary>
/// <param name="doc">The document to operate on</param>
internal static void UnwrapNoscriptImages(IHtmlDocument doc)
{
    // Find img without source or attributes that might contain an image, and remove it.
    // This is done to prevent a placeholder img being replaced by the img from noscript in the next step.
    var imgs = doc.GetElementsByTagName("img");
    ForEachElement(imgs, static img =>
    {
        for (var i = 0; i < img.Attributes.Length; i++)
        {
            var attr = img.Attributes[i]!;
            if (attr.Name is "src" or "srcset" or "data-src" or "data-srcset")
            {
                return;
            }

            // File extensions are matched case-insensitively so that values such as
            // "photo.JPG" also count as an image reference.
            if (Regex.IsMatch(attr.Value, @"\.(jpg|jpeg|png|webp)", RegexOptions.IgnoreCase))
            {
                return;
            }
        }

        img.Parent!.RemoveChild(img);
    });

    // Next find noscript and try to extract its image
    var noscripts = doc.GetElementsByTagName("noscript");
    ForEachElement(noscripts, static noscript =>
    {
        // Parse content of noscript and make sure it only contains an image.
        // The lambda is static, so the document is reached through the node itself.
        var ownerDocument = (IHtmlDocument)noscript.GetRoot();
        var tmp = ownerDocument.CreateElement("div");
        tmp.InnerHtml = noscript.InnerHtml;
        if (!IsSingleImage(tmp))
        {
            return;
        }

        // If noscript has a previous sibling and it only contains an image,
        // replace it with the noscript content. However we also keep old
        // attributes that might contain an image.
        if (noscript.PreviousElementSibling is IElement prevElement && IsSingleImage(prevElement))
        {
            var prevImg = prevElement;
            if (prevImg.TagName is not "IMG")
            {
                prevImg = prevElement.GetElementsByTagName("img")[0];
            }

            var newImg = tmp.GetElementsByTagName("img")[0];
            for (var i = 0; i < prevImg.Attributes.Length; i++)
            {
                var attr = prevImg.Attributes[i]!;
                if (attr.Value is "")
                {
                    continue;
                }

                if (attr.Name is "src" or "srcset"
                    || Regex.IsMatch(attr.Value, @"\.(jpg|jpeg|png|webp)", RegexOptions.IgnoreCase))
                {
                    if (string.Equals(newImg.GetAttribute(attr.Name), attr.Value, StringComparison.Ordinal))
                    {
                        continue;
                    }

                    // Do not overwrite an attribute the new image already has;
                    // keep the old value under a "data-old-" name instead.
                    var attrName = attr.Name;
                    if (newImg.HasAttribute(attrName))
                    {
                        attrName = "data-old-" + attrName;
                    }

                    newImg.SetAttribute(attrName, attr.Value);
                }
            }

            noscript.Parent!.ReplaceChild(tmp.FirstElementChild!, prevElement);
        }
    });
}
/// <summary>
/// Reads the route types from the table row whose first cell contains "Type:".
/// </summary>
/// <remarks>
/// The HTML-decoded text of that row's second cell is searched, case-insensitively, for
/// each known type keyword. A matched keyword is removed from the text before the next
/// keyword is tried, which is why "TRAD" must be checked before "TR".
/// </remarks>
/// <param name="doc">The parsed document.</param>
/// <returns>
/// The route types found, in keyword-table order; an empty list when the document has
/// no "Type:" row or that row has fewer than two cells.
/// </returns>
public static List<Route.RouteType> ParseRouteTypes(IHtmlDocument doc)
{
    List<Route.RouteType> result = new List<Route.RouteType>();

    // Rows without any <td> (e.g. header rows made of <th>) are skipped rather than dereferenced.
    var typeRow = doc.GetElementsByTagName("tr").FirstOrDefault(row =>
    {
        var firstCell = row.GetElementsByTagName("td").FirstOrDefault();
        return firstCell != null && firstCell.TextContent.Contains("Type:");
    });
    if (typeRow == null)
    {
        return result;
    }

    var cells = typeRow.GetElementsByTagName("td");
    if (cells.Length < 2)
    {
        return result;
    }

    string typeString = (HttpUtility.HtmlDecode(cells[1].TextContent) ?? string.Empty).Trim();

    // Order matters: "TRAD" has to go before "TR|TOP ROPE" so that we don't
    // accidentally match "TR" inside "TRAD".
    var knownTypes = new[]
    {
        new KeyValuePair<string, Route.RouteType>("BOULDER", Route.RouteType.Boulder),
        new KeyValuePair<string, Route.RouteType>("TRAD", Route.RouteType.Trad),
        new KeyValuePair<string, Route.RouteType>("TR|TOP ROPE", Route.RouteType.TopRope),
        new KeyValuePair<string, Route.RouteType>("AID", Route.RouteType.Aid),
        new KeyValuePair<string, Route.RouteType>("SPORT", Route.RouteType.Sport),
        new KeyValuePair<string, Route.RouteType>("MIXED", Route.RouteType.Mixed),
        new KeyValuePair<string, Route.RouteType>("ICE", Route.RouteType.Ice),
        new KeyValuePair<string, Route.RouteType>("ALPINE", Route.RouteType.Alpine),
        new KeyValuePair<string, Route.RouteType>("SNOW", Route.RouteType.Snow)
    };

    foreach (var known in knownTypes)
    {
        if (Regex.IsMatch(typeString, known.Key, RegexOptions.IgnoreCase))
        {
            // Strip the keyword so later, shorter keywords cannot match inside it.
            typeString = Regex.Replace(typeString, known.Key, "", RegexOptions.IgnoreCase);
            result.Add(known.Value);
        }
    }

    return result;
}
private static async Task <Common.IDocument> ResolveDocumentXrefsAsync( Common.IDocument input, IExecutionContext context, ConcurrentDictionary <string, ConcurrentBag <string> > failures) { IHtmlDocument htmlDocument = await input.ParseHtmlAsync(context, HtmlParser); if (htmlDocument != null) { // Find and replace "xref:" in links bool modifiedDocument = false; bool errors = false; foreach (IElement element in htmlDocument .GetElementsByTagName("a") .Where(x => x.HasAttribute("href"))) { string href = element.GetAttribute("href"); if (href.StartsWith("xref:") && href.Length > 5) { string xref = href.Substring(5); string queryAndFragment = string.Empty; int queryAndFragmentIndex = xref.IndexOfAny(new[] { '#', '?' }); if (queryAndFragmentIndex > 0) { queryAndFragment = xref.Substring(queryAndFragmentIndex); xref = xref.Substring(0, queryAndFragmentIndex); } if (context.TryGetXrefLink(xref, out string xrefLink, out string error)) { element.Attributes["href"].Value = xrefLink + queryAndFragment; } else { // Continue processing so we can report all the failures in a given document failures.AddOrUpdate( input.Source.FullPath, _ => new ConcurrentBag <string> { error }, (_, list) => { list.Add(error); return(list); }); errors = true; } modifiedDocument = true; } } // Exit if there were errors if (errors) { return(null); } // Return a new document with the replacements if we performed any if (modifiedDocument) { using (Stream contentStream = await context.GetContentStreamAsync()) { using (StreamWriter writer = contentStream.GetWriter()) { htmlDocument.ToHtml(writer, ProcessingInstructionFormatter.Instance); writer.Flush(); return(input.Clone(context.GetContentProvider(contentStream, MediaTypes.Html))); } } } }
/// <summary>
/// Find all &lt;noscript&gt; that are located after &lt;img&gt; nodes, and which contain
/// only one single &lt;img&gt; element. Replace the preceding image with the image from
/// inside the &lt;noscript&gt; tag. This improves the quality of the
/// images we use on some sites (e.g. Medium)
/// </summary>
/// <param name="doc">The document to operate on</param>
internal static void UnwrapNoscriptImages(IHtmlDocument doc)
{
    // Find img without source or attributes that might contain an image, and remove it.
    // This is done to prevent a placeholder img being replaced by the img from noscript in the next step.
    var imgs = doc.GetElementsByTagName("img");
    ForEachNode(imgs, (img) =>
    {
        var imgElement = img as IElement;
        if (imgElement == null)
        {
            return;
        }

        for (var i = 0; i < imgElement.Attributes.Length; i++)
        {
            var attr = imgElement.Attributes[i];
            switch (attr.Name)
            {
                case "src":
                case "srcset":
                case "data-src":
                case "data-srcset":
                    return;
            }

            // File extensions are matched case-insensitively so that values such as
            // "photo.JPG" also count as an image reference.
            if (Regex.IsMatch(attr.Value, @"\.(jpg|jpeg|png|webp)", RegexOptions.IgnoreCase))
            {
                return;
            }
        }

        img.Parent.RemoveChild(img);
    });

    // Next find noscript and try to extract its image
    var noscripts = doc.GetElementsByTagName("noscript");
    ForEachNode(noscripts, (noscript) =>
    {
        var noscriptElement = noscript as IElement;
        if (noscriptElement == null)
        {
            return;
        }

        // Parse content of noscript and make sure it only contains an image
        var tmp = doc.CreateElement("div");
        tmp.InnerHtml = noscriptElement.InnerHtml;
        if (!IsSingleImage(tmp))
        {
            return;
        }

        // If noscript has a previous sibling and it only contains an image,
        // replace it with the noscript content. However we also keep old
        // attributes that might contain an image.
        var prevElement = noscriptElement.PreviousElementSibling;
        if (prevElement == null || !IsSingleImage(prevElement))
        {
            return;
        }

        var prevImg = prevElement;
        if (prevImg.TagName != "IMG")
        {
            prevImg = prevElement.GetElementsByTagName("img")[0];
        }

        var newImg = tmp.GetElementsByTagName("img")[0];
        for (var i = 0; i < prevImg.Attributes.Length; i++)
        {
            var attr = prevImg.Attributes[i];
            if (attr.Value == "")
            {
                continue;
            }

            if (attr.Name == "src" || attr.Name == "srcset"
                || Regex.IsMatch(attr.Value, @"\.(jpg|jpeg|png|webp)", RegexOptions.IgnoreCase))
            {
                if (newImg.GetAttribute(attr.Name) == attr.Value)
                {
                    continue;
                }

                // Do not overwrite an attribute the new image already has;
                // keep the old value under a "data-old-" name instead.
                var attrName = attr.Name;
                if (newImg.HasAttribute(attrName))
                {
                    attrName = "data-old-" + attrName;
                }

                newImg.SetAttribute(attrName, attr.Value);
            }
        }

        noscript.Parent.ReplaceChild(tmp.FirstElementChild, prevElement);
    });
}
/// <summary>
/// Rewrites HTML elements of the document into BBCode-style markup, in place.
/// </summary>
/// <remarks>
/// Conversions run in a fixed order: anchors, images, lists (ordered, unordered, items),
/// headings, then strong, em, u, paragraphs, code, pre and strike. Each group is skipped
/// when the corresponding <c>BBIgnoring*</c> flag is set or its tag/type is listed in
/// <c>BBIgnoredTags</c> / <c>BBIgnoredElements</c>. Every conversion assigns the element's
/// <c>OuterHtml</c> while the element collection is being enumerated, so the order of the
/// steps is significant and is kept as is.
/// </remarks>
/// <param name="doc">The document to convert.</param>
/// <returns>The same document instance that was passed in.</returns>
public IHtmlDocument BBCodeConversion(IHtmlDocument doc)
{
    // Wraps the inner HTML of every element with the given tag name in [bbTag]...[/bbTag],
    // unless that tag name is listed in BBIgnoredTags.
    void WrapTag(string htmlTag, string bbTag)
    {
        if (BBIgnoredTags.IndexOf(htmlTag) >= 0)
        {
            return;
        }

        foreach (var tagged in doc.GetElementsByTagName(htmlTag))
        {
            tagged.OuterHtml = $"[{bbTag}]{tagged.InnerHtml}[/{bbTag}]";
        }
    }

    if (!BBIgnoringLinks)
    {
        foreach (var anchor in doc.All.OfType<IHtmlAnchorElement>())
        {
            anchor.OuterHtml = $"[url={anchor.Href}]{anchor.InnerHtml}[/url]";
        }
    }

    if (!BBIgnoringImages)
    {
        foreach (var image in doc.All.OfType<IHtmlImageElement>())
        {
            image.OuterHtml = $"[img]{image.Source}[/img]";
        }
    }

    if (!BBIgnoringLists)
    {
        foreach (var ordered in doc.All.OfType<IHtmlOrderedListElement>())
        {
            ordered.OuterHtml = $"[olist]{ordered.InnerHtml}[/olist]";
        }

        foreach (var unordered in doc.All.OfType<IHtmlUnorderedListElement>())
        {
            unordered.OuterHtml = $"[list]{unordered.InnerHtml}[/list]";
        }

        foreach (var listItem in doc.All.OfType<IHtmlListItemElement>())
        {
            listItem.OuterHtml = $"[*]{listItem.InnerHtml}";
        }
    }

    if (!BBIgnoringHeaders)
    {
        foreach (var heading in doc.All.OfType<IHtmlHeadingElement>())
        {
            // Plain text substitution of the heading tags; only attribute-less
            // opening tags such as "<h2>" are matched by these literals.
            var comparer = StringComparison.OrdinalIgnoreCase;
            var markup = heading.OuterHtml;

            for (var level = 1; level <= 6; level++)
            {
                markup = markup.Replace("<h" + level + ">", "[h" + level + "]", comparer);
            }

            for (var level = 1; level <= 6; level++)
            {
                markup = markup.Replace("</h" + level + ">", "[/h" + level + "]", comparer);
            }

            heading.OuterHtml = markup;
        }
    }

    WrapTag(TagNames.Strong, "b");
    WrapTag(TagNames.Em, "i");
    WrapTag(TagNames.U, "u");

    // Paragraphs are unwrapped to their content followed by a line break.
    var paragraphsIgnored = BBIgnoredTags.IndexOf(TagNames.P) >= 0
        || BBIgnoredElements.IndexOf(typeof(IHtmlParagraphElement)) >= 0;
    if (!paragraphsIgnored)
    {
        foreach (var paragraph in doc.All.OfType<IHtmlParagraphElement>())
        {
            paragraph.OuterHtml = paragraph.InnerHtml + Environment.NewLine;
        }
    }

    WrapTag(TagNames.Code, "code");
    WrapTag(TagNames.Pre, "noparse");
    WrapTag(TagNames.Strike, "strike");

    return doc;
}
/// <summary>
/// Parses a race results page into a <c>RaceModel</c>.
/// </summary>
/// <remarks>
/// The race name is the first line of the element with class <c>ResultsArchiveTitle</c>.
/// Each <c>tr</c> of the first <c>tbody</c> becomes one result, read from the row's child
/// cells 1-7 (position, number, driver names, car, laps, time, points). A time cell
/// without child elements holds either "DNF"/"DNS" (no time) or a time span; a time cell
/// with child elements holds a number of seconds before its first tag, which is stored
/// in <c>Retired</c>. All numbers are parsed with the invariant culture.
/// </remarks>
/// <param name="document">The parsed results page.</param>
/// <returns>The race name, the parsed results and the raw HTML of the results column.</returns>
/// <exception cref="InvalidOperationException">
/// An expected element (title, tbody or results column) is missing.
/// </exception>
/// <exception cref="FormatException">A numeric or time cell could not be parsed.</exception>
public RaceModel Parse(IHtmlDocument document)
{
    var trace = document.GetElementsByClassName("ResultsArchiveTitle").First();
    var raceName = trace.InnerHtml.Trim();

    // Keep only the first line of the title; a single-line title is used as is
    // (IndexOf returns -1 there, which Substring would reject).
    int firstLineEnd = raceName.IndexOf('\n');
    string name = firstLineEnd >= 0 ? raceName.Substring(0, firstLineEnd) : raceName;

    var tbody = document.GetElementsByTagName("tbody").First();
    var trs = tbody.GetElementsByTagName("tr");

    var list = new List<RacingResult>();
    foreach (var item in trs)
    {
        var racingResult = new RacingResult();
        racingResult.Pos = item.Children[1].InnerHtml;
        racingResult.No = int.Parse(item.Children[2].InnerHtml, CultureInfo.InvariantCulture);
        racingResult.DriverFirstName = item.Children[3].Children[0].InnerHtml;
        racingResult.DriverLastName = item.Children[3].Children[1].InnerHtml;
        racingResult.DriverShortName = item.Children[3].Children[2].InnerHtml;
        racingResult.Car = item.Children[4].InnerHtml;
        racingResult.Laps = int.Parse(item.Children[5].InnerHtml, CultureInfo.InvariantCulture);

        string timeStr = item.Children[6].InnerHtml;
        if (item.Children[6].ChildElementCount == 0)
        {
            if (timeStr == "DNF" || timeStr == "DNS")
            {
                racingResult.Time = null;
            }
            else
            {
                racingResult.Time = TimeSpan.Parse(timeStr, CultureInfo.InvariantCulture);
            }
        }
        else
        {
            // The cell contains markup: the leading text before the first tag is a
            // number of seconds, split here into whole seconds and milliseconds.
            string secondStr = timeStr.Substring(0, timeStr.IndexOf('<'));
            decimal seconds = decimal.Parse(secondStr, CultureInfo.InvariantCulture);
            racingResult.Retired = new TimeSpan(0, 0, 0, (int)seconds, (int)((seconds - (int)seconds) * 1000));
        }

        racingResult.Pts = int.Parse(item.Children[7].InnerHtml, CultureInfo.InvariantCulture);
        list.Add(racingResult);
    }

    var table = document.GetElementsByClassName("resultsarchive-col-right").First().OuterHtml;

    return new RaceModel()
    {
        RaceName = name,
        RaceResults = list.ToArray(),
        ResultTable = table
    };
}