/// <summary>
/// Downloads the page referenced by <paramref name="link"/>, parses it and builds a chapter from
/// the <c>.chapter-title</c> element, the <c>sentence.translated</c> elements found inside
/// <c>.chapter-body</c> and the "next" pager link.
/// </summary>
/// <param name="link">Chapter link whose <c>Url</c> is downloaded.</param>
/// <param name="options">Retrieval options; not read by this implementation.</param>
/// <param name="token">Cancellation token passed to the download and to the HTML parser.</param>
/// <returns>A chapter carrying the title text, the cleaned content and the next-chapter URL (null when no pager link exists).</returns>
public override async Task<WebNovelChapter> GetChapterAsync(ChapterLink link, ChapterRetrievalOptions options = default(ChapterRetrievalOptions), CancellationToken token = default(CancellationToken))
{
    string pageSource = await GetWebPageAsync(link.Url, token);
    IHtmlDocument doc = await Parser.ParseAsync(pageSource, token);

    IElement heading = doc.DocumentElement.QuerySelector(".chapter-title");
    IElement body = doc.DocumentElement.QuerySelector(".chapter-body");

    // Append an (empty) paragraph element to every translated sentence so that the sentences
    // are separated once their markup is concatenated below.
    foreach (IElement sentence in body.QuerySelectorAll("sentence.translated").ToList())
    {
        sentence.AppendChild(doc.CreateElement("P"));
    }

    // Collect the markup of all translated sentences into one container element.
    var sentenceMarkup = body
        .QuerySelectorAll("sentence.translated")
        .Select(s => s.InnerHtml);
    IElement contentEl = doc.CreateElement("P");
    contentEl.InnerHtml = string.Join("", sentenceMarkup);

    RemoveSpecialTags(doc, contentEl);

    IElement nextAnchor = doc.QuerySelector("ul.pager > li.next > a");
    string nextChapter = nextAnchor?.GetAttribute("href");

    return new WebNovelChapter
    {
        ChapterName = heading?.GetInnerText(),
        Content = new ContentCleanup(BaseUrl).Execute(doc, contentEl),
        NextChapterUrl = nextChapter
    };
}
/// <summary>
/// Creates an <c>&lt;input type="hidden"&gt;</c> element with the given name and value.
/// The element is created by <paramref name="document"/> but is not inserted into it.
/// </summary>
/// <param name="document">Document used as the element factory.</param>
/// <param name="name">Value of the <c>name</c> attribute.</param>
/// <param name="value">Value of the <c>value</c> attribute.</param>
/// <returns>The newly created, detached input element.</returns>
public static INode CreateHidden(this IHtmlDocument document, string name, string value)
{
    // The original created a second, immediately discarded element here; one is enough.
    var hiddenInput = document.CreateElement("input");
    hiddenInput.SetAttribute("name", name);
    hiddenInput.SetAttribute("value", value);
    hiddenInput.SetAttribute("type", "hidden");
    return hiddenInput;
}
/// <summary>
/// Rewrites the pager of the index page: the first and last cell matched by
/// <c>#pageBar tr.pages_str td</c> receive "previous"/"next" anchors, the first cell of the last
/// <c>#pageBar</c> row is emptied and given the id <c>tdPages</c>, and a script is appended to the
/// head that assigns the serialized <paramref name="postStrings"/> (in reverse order) to
/// <c>postStrings</c> and calls <c>initPages</c> with <c>Constants.ArchivePageSize</c>.
/// </summary>
/// <param name="indexDoc">Index document that is modified in place.</param>
/// <param name="postStrings">Strings that are serialized to JSON for the page script.</param>
private void SetNav(IHtmlDocument indexDoc, IEnumerable<string> postStrings)
{
    var tds = indexDoc.QuerySelectorAll("#pageBar tr.pages_str td");
    var tdPrev = tds.First();
    var tdNext = tds.Last();

    var anchorPrev = indexDoc.CreateElement("a");
    anchorPrev.TextContent = "< предыдущая";
    anchorPrev.SetAttribute("href", "#");
    anchorPrev.Id = "anchorPrev";
    tdPrev.InnerHtml = "";
    tdPrev.AppendChild(anchorPrev);

    var anchorNext = indexDoc.CreateElement("a");
    anchorNext.TextContent = "следующая >";
    anchorNext.SetAttribute("href", "#");
    anchorNext.Id = "anchorNext";
    tdNext.InnerHtml = "";
    tdNext.AppendChild(anchorNext);

    // The page links themselves are not generated here; the cell is only emptied and tagged.
    // NOTE(review): presumably initPages fills #tdPages on the client - confirm.
    var td = indexDoc.QuerySelectorAll("#pageBar tr").Last().QuerySelector("td");
    td.InnerHtml = "";
    td.Id = "tdPages";

    var serializedData = JsonConvert.SerializeObject(postStrings.Reverse(), new JsonSerializerSettings
    {
        ContractResolver = new CamelCasePropertyNamesContractResolver(),
        Formatting = Formatting.Indented,
        // The JSON ends up inside a <script> element. Escaping '<', '>' and '&' as \uXXXX keeps a
        // "</script>" inside a post string from terminating the script block; the JavaScript
        // values are unchanged.
        StringEscapeHandling = StringEscapeHandling.EscapeHtml
    });

    var script = indexDoc.CreateElement("script");
    script.SetAttribute("type", "text/javascript");
    script.InnerHtml = $@" $(function(){{ postStrings = {serializedData}; initPages({Constants.ArchivePageSize}); }}); ";
    indexDoc.QuerySelector("head").AppendChild(script);
}
/// <summary>
/// Builds a <c>Post</c> from a post element: user, numeric post id, "seen" flag and the
/// HTML-encoded body. Imgur GIFs inside the body are replaced by wrapped poster images and
/// <c>attachment.php</c> sources are prefixed with the forum host.
/// </summary>
/// <param name="parent">Document used to create replacement elements.</param>
/// <param name="doc">The post element; it is modified in place.</param>
/// <returns>The parsed post. <c>PostHtml</c> is only set when a <c>.postbody</c> element exists.</returns>
public static Post ParsePost(IHtmlDocument parent, IElement doc)
{
    var post = new Post();
    post.User = UserHandler.ParseUserFromPost(doc);
    post.PostId = Convert.ToInt64(doc.Id.Replace("post", ""));

    // The author cell is dropped from the post; tolerate posts that do not have one
    // (the original dereferenced the query result unconditionally).
    var authorTd = doc.QuerySelector(@"[class*=""userid""]");
    authorTd?.Remove();

    post.HasSeen = doc.QuerySelector(@"[class=""seen1""]") != null
                   || doc.QuerySelector(@"[class=""seen2""]") != null;

    var threadBody = doc.QuerySelector(".postbody");
    if (threadBody == null)
    {
        return post;
    }

    var jerkBody = threadBody.QuerySelector(@"a[title=""DON'T DO IT!!""]");
    if (jerkBody != null)
    {
        post.IsIgnored = true;
    }
    else
    {
        // Swap every imgur GIF for a static poster image that remembers the original URL.
        foreach (var imgurGif in threadBody.QuerySelectorAll(@"[src*=""imgur.com""][src*="".gif""]"))
        {
            var originalUrl = imgurGif.GetAttribute("src");
            var posterUrl = originalUrl.Replace(".gif", "h.jpg");

            var div = parent.CreateElement("div");
            div.ClassList.Add("gifWrap");

            var newImgur = parent.CreateElement("img");
            newImgur.ClassList.Add("imgurGif");
            newImgur.SetAttribute("data-originalurl", originalUrl);
            newImgur.SetAttribute("data-posterurl", posterUrl);
            newImgur.SetAttribute("src", posterUrl);

            div.AppendChild(newImgur);
            imgurGif.Replace(div);
        }

        // Attachment sources are prefixed with the forum host.
        foreach (var attachment in threadBody.QuerySelectorAll(@"[src*=""attachment.php""]"))
        {
            attachment.SetAttribute("src", $"https://forums.somethingawful.com/{attachment.GetAttribute("src")}");
        }
    }

    post.PostHtml = HtmlEncode(threadBody.InnerHtml);
    return post;
}
/// <summary>
/// Walks the children of <paramref name="parentElement"/> recursively. Every two-character
/// <c>hN</c> element with N greater than 1 is registered in <c>_tree</c> at level N and gets a
/// named anchor (the tree item's id) inserted as its first child. Other elements with child
/// nodes are searched recursively.
/// </summary>
/// <param name="document">Document used to create the anchor elements.</param>
/// <param name="parentElement">Element whose children are inspected.</param>
private void ParseHTagsAndAddAnchors(IHtmlDocument document, IElement parentElement)
{
    foreach (IElement element in parentElement.Children)
    {
        string tagName = element.NodeName.ToLowerInvariant();

        if (tagName.Length == 2 && tagName[0] == 'h')
        {
            // Use the H number (e.g. 2 for H2) as the current level in the tree.
            // Non-numeric suffixes (e.g. "hr") leave the level at 0 and are skipped below.
            int level;
            int.TryParse(tagName.Substring(1), out level);

            // Level sanity check for bad markup
            if (level > 1)
            {
                // The text is only read for real headings; reading it for every element of the
                // tree is wasted work because TextContent walks the whole subtree.
                string title = element.TextContent;
                Item item = _tree.AddItemAtLevel(level, title);

                // Insert an anchor tag at the start of the header as a reference
                IElement anchor = document.CreateElement("a");
                anchor.SetAttribute("name", item.Id);
                element.InnerHtml = anchor.OuterHtml + element.InnerHtml;
            }
        }
        else if (element.HasChildNodes)
        {
            ParseHTagsAndAddAnchors(document, element);
        }
    }
}
/// <summary>
/// Replaces every <c>h{from}</c> element of the document. When <paramref name="to"/> is 5 the
/// heading is handed to <c>ReplaceChildElementByText</c>; otherwise it is replaced by a new
/// <c>h{to}</c> element that receives the heading's inner HTML and, when set, its text alignment.
/// </summary>
/// <param name="document">Document that is searched and modified.</param>
/// <param name="from">Level of the headings to replace.</param>
/// <param name="to">Level of the replacement headings.</param>
protected virtual void TransformHeadings(IHtmlDocument document, int from, int to)
{
    foreach (var heading in document.QuerySelectorAll($"h{from}"))
    {
        var container = heading.Parent;

        if (to == 5)
        {
            ReplaceChildElementByText(container, heading, document);
            continue;
        }

        var replacement = document.CreateElement($"h{to}");
        replacement.InnerHtml = heading.InnerHtml;

        // Carry the text alignment over to the replacement heading
        bool hasAlignment = heading.Style != null && !string.IsNullOrEmpty(heading.Style.TextAlign);
        if (hasAlignment)
        {
            replacement.Style.TextAlign = heading.Style.TextAlign;
        }

        container.ReplaceChild(replacement, heading);
    }
}
/// <summary>
/// In-lines the CSS for the current HTML
/// </summary>
/// <param name="removeStyleElements">If set to <c>true</c> the style elements are removed.</param>
/// <param name="ignoreElements">CSS selector for STYLE elements to ignore (e.g. mobile-specific styles etc.)</param>
/// <param name="css">A string containing a style-sheet for inlining.</param>
/// <param name="stripIdAndClassAttributes">True to strip ID and class attributes</param>
/// <param name="removeComments">True to remove comments, false to leave them intact</param>
/// <param name="keepMediaQueries">True to add back any media queries; only honoured when <paramref name="removeStyleElements"/> is also true.</param>
/// <returns>The serialized document, with styles moved to inline attributes, together with the collected warnings.</returns>
public InlineResult MoveCssInline(bool removeStyleElements = false, string ignoreElements = null, string css = null, bool stripIdAndClassAttributes = false, bool removeComments = false, bool keepMediaQueries = false)
{
    // Store the variables used for inlining the CSS
    _removeStyleElements = removeStyleElements;
    _stripIdAndClassAttributes = stripIdAndClassAttributes;
    _ignoreElements = ignoreElements;
    _keepMediaQueries = keepMediaQueries;
    _css = css;

    // Gather all of the CSS that we can work with.
    var cssSourceNodes = CssSourceNodes();
    var cssLinkNodes = CssLinkNodes();
    var cssSources = new List<ICssSource>(ConvertToStyleSources(cssSourceNodes));
    cssSources.AddRange(ConvertToStyleSources(cssLinkNodes));

    var cssBlocks = GetCssBlocks(cssSources);

    if (_removeStyleElements)
    {
        RemoveStyleElements(cssSourceNodes);
        RemoveStyleElements(cssLinkNodes);
    }

    var joinedStyles = Join(cssBlocks);
    var joinedBlocks = joinedStyles.Styles;
    var mediaQueries = joinedStyles.MediaQueries;

    var validSelectors = CleanUnsupportedSelectors(joinedBlocks);
    var elementsWithStyles = FindElementsWithStyles(validSelectors);
    var mergedStyles = MergeStyleClasses(elementsWithStyles);

    StyleClassApplier.ApplyAllStyles(mergedStyles);

    if (_stripIdAndClassAttributes)
    {
        StripElementAttributes("id", "class");
    }

    if (_removeStyleElements && _keepMediaQueries)
    {
        // Re-add the media queries in a fresh style element at the top of the body.
        // They are separated by a newline: a comma between rules is not valid CSS and makes
        // parsers discard every block after the first one.
        var styleElem = _document.CreateElement("style");
        styleElem.TextContent = String.Join("\n", mediaQueries);
        _document.Body.Prepend(styleElem);
    }

    if (removeComments)
    {
        // Snapshot first: nodes are removed while walking the result.
        var comments = _document.Descendents<IComment>().ToList();
        foreach (var comment in comments)
        {
            comment.Remove();
        }
    }

    var html = _document.ToHtml(new AutoSelectedMarkupFormatter(_document.Doctype));
    return new InlineResult(html, _warnings);
}
/// <summary>
/// Appends a <c>script</c> element to the document head whose text is <paramref name="text"/>
/// and whose <c>type</c> attribute is <paramref name="type"/>.
/// </summary>
/// <param name="doc">Document whose head receives the element.</param>
/// <param name="text">Text content of the script element.</param>
/// <param name="type">Value of the <c>type</c> attribute.</param>
public static void SaveDefault(IHtmlDocument doc, string text, string type)
{
    IElement holder = doc.CreateElement("script");
    holder.TextContent = text;
    holder.SetAttribute("type", type);

    doc.Head.AppendChild(holder);
}
/// <summary>
/// Applies the meta element commands of <paramref name="store"/> to the document head.
/// Before anything is changed, the current meta elements (those with a <c>name</c>,
/// <c>property</c> or <c>http-equiv</c> attribute) are passed to <c>SaveDefault</c>.
/// A <c>Set</c> command updates the content of the matching meta element, creating it when it
/// does not exist; a <c>Remove</c> command deletes the matching element if there is one.
/// </summary>
/// <param name="store">Source of the commands; nothing happens when it has none.</param>
/// <param name="doc">Document whose head is modified.</param>
private void SetMetaElements(IHeadElementHelperStore store, IHtmlDocument doc)
{
    if (store.MetaElementCommands.Count == 0)
    {
        return;
    }

    var metaTags = doc.Head
        .QuerySelectorAll("meta[name],meta[property],meta[http-equiv]")
        .Cast<IHtmlMetaElement>()
        .ToList();

    var metaElements = metaTags.Select(m => new MetaElement
    {
        Name = m.Name ?? "",
        Property = m.GetAttribute("property") ?? "",
        HttpEquiv = m.HttpEquivalent ?? "",
        Content = m.Content
    });
    SaveDefault(doc, metaElements, "text/default-meta-elements");

    foreach (var cmd in store.MetaElementCommands)
    {
        // A command addresses an element by name, property or http-equiv; empty keys never match.
        var meta = metaTags.FirstOrDefault(m =>
            (cmd.Element.Name != "" && cmd.Element.Name == m.Name)
            || (cmd.Element.Property != "" && cmd.Element.Property == m.GetAttribute("property"))
            || (cmd.Element.HttpEquiv != "" && cmd.Element.HttpEquiv == m.HttpEquivalent));

        if (cmd.Operation == MetaElementOperations.Set)
        {
            if (meta == null)
            {
                // Direct cast instead of 'as': an unexpected element type fails here with a clear
                // InvalidCastException rather than with a NullReferenceException further down.
                meta = (IHtmlMetaElement)doc.CreateElement("meta");
                if (cmd.Element.Name != "")
                {
                    meta.Name = cmd.Element.Name;
                }
                if (cmd.Element.Property != "")
                {
                    meta.SetAttribute("property", cmd.Element.Property);
                }
                if (cmd.Element.HttpEquiv != "")
                {
                    meta.HttpEquivalent = cmd.Element.HttpEquiv;
                }
                doc.Head.AppendChild(meta);
                // Track the new element so later commands in this loop can find it.
                metaTags.Add(meta);
            }
            meta.Content = cmd.Element.Content;
        }
        else if (cmd.Operation == MetaElementOperations.Remove)
        {
            if (meta != null)
            {
                doc.Head.RemoveChild(meta);
                metaTags.Remove(meta);
            }
        }
    }
}
/// <summary>
/// Removes every <c>script</c> element from the document and then appends two script
/// references to the head: the jQuery file followed by the diary script, both located in
/// <c>Constants.AccountPagesDir</c> below <paramref name="dirPrefix"/>.
/// </summary>
/// <param name="doc">Document that is modified in place.</param>
/// <param name="dirPrefix">Prefix that is put in front of the script paths.</param>
private void AddScripts(IHtmlDocument doc, string dirPrefix)
{
    foreach (var existing in doc.QuerySelectorAll("script"))
    {
        existing.Remove();
    }

    // Order matters for the output: jQuery first, then the diary script.
    var fileNames = new[] { Constants.JQueryFileName, Constants.DiaryJsFileName };
    foreach (var fileName in fileNames)
    {
        var reference = doc.CreateElement("script");
        reference.SetAttribute("type", "text/javascript");
        reference.SetAttribute("src", dirPrefix + Constants.AccountPagesDir + "/" + fileName);
        doc.QuerySelector("head").AppendChild(reference);
    }
}
/// <summary>
/// Turns indentation block quotes (those whose markup contains
/// <c>margin:0px 0px 0px 40px</c>) into paragraphs with a left margin of 40px per nesting level.
/// A block quote whose first child element is another block quote only increases the nesting
/// level; the first one that is not such a wrapper is converted and replaces the outermost
/// wrapper of its chain. Chains deeper than four levels are not replaced.
/// </summary>
/// <param name="blockQuotes">Block quote elements to inspect, in iteration order.</param>
/// <param name="document">Document used to create the paragraphs.</param>
protected virtual void TransformBlockQuotes(IHtmlCollection<IElement> blockQuotes, IHtmlDocument document)
{
    int depth = 1;
    INode innermostWrapper = null;

    foreach (var quote in blockQuotes)
    {
        if (!quote.OuterHtml.ToLower().Contains("margin:0px 0px 0px 40px"))
        {
            continue;
        }

        bool wrapsAnotherQuote = quote.ChildElementCount > 0
                                 && quote.Children[0].TagName.ToLower() == "blockquote";
        if (wrapsAnotherQuote)
        {
            // Remember the wrapper closest to the content and go one level deeper.
            innermostWrapper = quote;
            depth++;
            continue;
        }

        var paragraph = document.CreateElement("p");
        // Drop P as nested P is not allowed in clean html
        // TODO: do this in a better way
        paragraph.InnerHtml = quote.InnerHtml
            .Replace("<p>", "")
            .Replace("</p>", "")
            .Replace("<P>", "")
            .Replace("</P>", "");
        paragraph.SetAttribute("style", $"margin-left:{depth * 40}px;");

        if (depth <= 4)
        {
            // Depth 1 replaces the quote itself. For deeper chains start at the innermost wrapper
            // and climb one parent per additional level to reach the outermost wrapper.
            INode replaced = depth == 1 ? quote : innermostWrapper;
            for (int hop = 2; hop < depth; hop++)
            {
                replaced = replaced.Parent;
            }
            replaced.Parent.ReplaceChild(paragraph, replaced);
        }

        depth = 1;
    }
}
/// <summary>
/// Wraps every text node that is a direct child of <paramref name="target"/> in a new
/// <c>p</c> element whose text is the node's text passed through <c>RegexTrimAndNormalize</c>.
/// Whitespace-only text nodes are left alone so that no empty paragraphs are produced.
/// </summary>
/// <remarks>
/// The previous implementation enumerated <c>target.Children</c>, which only contains elements,
/// so its text-node filter never matched and the method did nothing.
/// </remarks>
/// <param name="target">Element whose direct text children are converted.</param>
private void PostProcessTransformDanglingTextToPElem(IElement target)
{
    // ChildNodes (not Children) is the collection that includes text nodes. Snapshot it because
    // nodes are replaced while iterating.
    var danglingTextNodes = target.ChildNodes
        .Where(node => node.NodeType == NodeType.Text)
        .ToList();

    foreach (var textNode in danglingTextNodes)
    {
        if (string.IsNullOrWhiteSpace(textNode.TextContent))
        {
            continue;
        }

        var newElem = rootDoc.CreateElement("p");
        newElem.TextContent = textNode.TextContent.RegexTrimAndNormalize();
        textNode.Parent?.ReplaceChild(newElem, textNode);
    }
}
/// <summary>
/// Rewrites the <c>href</c> of every <c>a</c> node and the <c>src</c> of every <c>img</c> node
/// found by <c>NodeUtility.GetAllNodesWithTag</c> in <paramref name="articleContent"/> through
/// <c>uri.ToAbsoluteURI</c>. Empty or missing attributes are left untouched. Links whose
/// <c>href</c> starts with <c>javascript:</c> are removed: a link holding a single text node
/// becomes a text node, any other link becomes a <c>span</c> that keeps all its children.
/// </summary>
/// <param name="articleContent">The node in which to fix all relative uri</param>
/// <param name="uri">The base uri</param>
/// <param name="doc">The document used to create the replacement nodes</param>
internal static void FixRelativeUris(IElement articleContent, Uri uri, IHtmlDocument doc)
{
    var links = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "a" });
    NodeUtility.ForEachNode(links, (link) =>
    {
        var anchor = (IElement)link;
        var href = anchor.GetAttribute("href");
        if (String.IsNullOrWhiteSpace(href))
        {
            return;
        }

        // Ordinal prefix test: the scheme is not linguistic text, and a prefix check does not
        // need to scan the whole attribute value.
        if (!href.StartsWith("javascript:", StringComparison.Ordinal))
        {
            anchor.SetAttribute("href", uri.ToAbsoluteURI(href));
            return;
        }

        // Remove links with javascript: URIs, since
        // they won't work after scripts have been removed from the page.
        if (link.ChildNodes.Length == 1 && link.ChildNodes[0].NodeType == NodeType.Text)
        {
            // if the link only contains simple text content, it can be converted to a text node
            var text = doc.CreateTextNode(link.TextContent);
            link.Parent.ReplaceChild(text, link);
        }
        else
        {
            // if the link has multiple children, they should all be preserved
            var container = doc.CreateElement("span");
            while (link.ChildNodes.Length > 0)
            {
                // Appending moves the node, so the first child changes on every pass.
                container.AppendChild(link.ChildNodes[0]);
            }
            link.Parent.ReplaceChild(container, link);
        }
    });

    var imgs = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "img" });
    NodeUtility.ForEachNode(imgs, (img) =>
    {
        var image = (IElement)img;
        var src = image.GetAttribute("src");
        if (!String.IsNullOrWhiteSpace(src))
        {
            image.SetAttribute("src", uri.ToAbsoluteURI(src));
        }
    });
}
/// <summary>
/// Replaces every descendant of <paramref name="element"/> for which
/// <c>IsUntransformableBlockElement</c> returns true (and which has a parent element) by a
/// <c>div</c> that takes over all of its child nodes.
/// </summary>
/// <param name="element">Root element whose descendants are inspected.</param>
/// <param name="document">Document used to create the replacement elements.</param>
private void CleanHtml(IElement element, IHtmlDocument document)
{
    // Work on a snapshot: the tree is changed while it is being walked.
    var descendants = element.QuerySelectorAll("*").ToList();

    foreach (var candidate in descendants)
    {
        if (candidate.ParentElement == null || !IsUntransformableBlockElement(candidate))
        {
            continue;
        }

        // Move the current children into a fresh div ...
        var wrapper = document.CreateElement("div");
        var movedNodes = candidate.ChildNodes.ToList();
        foreach (var moved in movedNodes)
        {
            wrapper.AppendChild(moved);
        }

        // ... and put that div in place of the unsupported node.
        candidate.ParentElement.ReplaceChild(wrapper, candidate);
    }
}
/// <summary>
/// Follows a chain of single-child elements starting at <paramref name="ell"/> for as long as
/// the tag names are contained in <c>voidptags</c>. When the chain ends in an element without
/// child elements and without non-whitespace text, a <c>br</c> with class <c>breakline</c>
/// is appended to it.
/// </summary>
/// <param name="doc">Document used to create the <c>br</c> element.</param>
/// <param name="ell">Element at which the inspection starts.</param>
private static void DetectVoidParagraph(IHtmlDocument doc, IElement ell)
{
    IElement current = ell;

    while (voidptags.Contains(current.TagName.ToLower()))
    {
        if (current.Children.Length == 0 && string.IsNullOrWhiteSpace(current.TextContent))
        {
            var lineBreak = doc.CreateElement("br");
            lineBreak.ClassName = "breakline";
            current.AppendChild(lineBreak);
            return;
        }

        if (current.Children.Length != 1)
        {
            return;
        }

        // Exactly one child element: continue the inspection there.
        current = current.Children[0];
    }
}
/// <summary>
/// Replaces every <c>h{from}</c> element of the document. When <paramref name="to"/> is 5 the
/// heading is handed to <c>ReplaceChildElementByText</c>; otherwise it is replaced by a new
/// <c>h{to}</c> element that carries only the heading's text content.
/// </summary>
/// <param name="document">Document that is searched and modified.</param>
/// <param name="from">Level of the headings to replace.</param>
/// <param name="to">Level of the replacement headings.</param>
protected virtual void TransformHeadings(IHtmlDocument document, int from, int to)
{
    var headings = document.QuerySelectorAll($"h{from}");

    foreach (var heading in headings)
    {
        var owner = heading.Parent;

        if (to != 5)
        {
            var substitute = document.CreateElement($"h{to}");
            substitute.TextContent = heading.TextContent;
            owner.ReplaceChild(substitute, heading);
        }
        else
        {
            ReplaceChildElementByText(owner, heading, document);
        }
    }
}
/// <summary>
/// Applies three modifications to the document: a red warning <c>div</c> is prepended to every
/// form that contains a password input; the first non-active menu link whose trimmed text is
/// "Encyklopedie" is renamed and pointed to <c>/kurz</c> (and marked as the active menu entry
/// when <paramref name="path"/> is "kurz"); the logo link text is replaced.
/// </summary>
/// <param name="document">Document that is modified in place.</param>
/// <param name="path">Requested path; compared against "kurz".</param>
public void ModifyTree(IHtmlDocument document, string path)
{
    foreach (var form in document.QuerySelectorAll("form"))
    {
        bool hasPasswordField = form.QuerySelector("input[type=password]") != null;
        if (hasPasswordField)
        {
            var warning = document.CreateElement("div");
            warning.SetAttribute("style", "color: red; font-size: 3em; font-weight: bold");
            warning.TextContent = "Web běží na magické proxy, které byste měli věřit!!!";
            form.Prepend(warning);
        }
    }

    var encyklopedie = document
        .QuerySelectorAll("#menu ul li:not(.active) a")
        .FirstOrDefault(x => x.TextContent.Trim() == "Encyklopedie");
    if (encyklopedie != null)
    {
        encyklopedie.TextContent = "Kurzy";
        encyklopedie.SetAttribute("href", "/kurz");

        if (path == "kurz")
        {
            // Move the "active" marker from the current entries to the renamed one.
            foreach (var li in document.QuerySelectorAll("#menu ul li.active"))
            {
                li.ClassList.Remove("active");
            }
            encyklopedie.ParentElement.ClassList.Add("active");
        }
    }

    var logo = document.QuerySelector("#logo > h1:nth-child(1) > a:nth-child(1)");
    if (logo != null)
    {
        logo.TextContent = "KSP Hacked Edition";
    }
}
/// <summary>
/// Inserts a <c>P</c> element with a textual placeholder in front of every <c>img</c> and
/// <c>iframe</c> element that has a parent. The placeholder names the kind of element
/// ("Image" or "IFrame") and the value of its <c>src</c> attribute (empty when there is none).
/// </summary>
/// <param name="document">Document that is searched and modified.</param>
protected virtual void ImageIFramePlaceHolders(IHtmlDocument document)
{
    var targets = document.QuerySelectorAll("img").Union(document.QuerySelectorAll("iframe"));

    foreach (var target in targets)
    {
        var srcAttribute = target.Attributes
            .FirstOrDefault(p => p.Name.Equals("src", StringComparison.InvariantCultureIgnoreCase));
        string sourceValue = srcAttribute != null ? srcAttribute.Value : "";

        string webPartType = target is IHtmlImageElement
            ? "Image"
            : (target is IHtmlInlineFrameElement ? "IFrame" : "");

        // Create P element and insert it just before the current image or iframe element
        var marker = document.CreateElement("P");
        marker.TextContent = $"***{webPartType} placeholder for source {sourceValue}***";

        if (target.Parent == null)
        {
            continue;
        }
        target.Parent.InsertBefore(marker, target);
    }
}
/// <summary>
/// Loads the content behind the "more" links (<c>a.LinkMore</c>) of the document and embeds it.
/// <list type="bullet">
/// <item><description><c>Preloaded</c>: nothing is done.</description></item>
/// <item><description><c>OnDemand</c>: for every link the URL is built from the quoted arguments of
/// its <c>onclick</c> attribute, downloaded, and the <c>innerHTML = '...'</c> payload of the
/// response is written into the span with id <c>more{part}</c>.</description></item>
/// <item><description><c>FullPage</c>: the page behind the first link is downloaded once; for every
/// link the nodes between the anchors <c>{name}</c> and <c>{name}end</c> of that page are copied
/// into a new hidden span that is inserted after the link.</description></item>
/// </list>
/// </summary>
/// <param name="doc">Document that is modified in place.</param>
/// <returns>
/// <c>false</c> when the more type is <c>Preloaded</c> or when no link with a usable
/// <c>href</c> exists; otherwise <c>true</c>.
/// </returns>
protected async Task<bool> FixMore(IHtmlDocument doc)
{
    await DetectMoreType();
    if (_moreType == DiaryMoreLinksType.Preloaded)
    {
        return false;
    }

    // Links without a target, or pointing at the in-page "#more" anchor, need no loading.
    var actualLinks = (from moreLink in doc.QuerySelectorAll("a.LinkMore")
                       let href = moreLink.GetAttribute("href")
                       where !string.IsNullOrEmpty(href)
                             && !string.Equals(href, "#more", StringComparison.OrdinalIgnoreCase)
                       select moreLink).ToList();
    if (actualLinks.Count == 0)
    {
        return false;
    }

    if (_moreType == DiaryMoreLinksType.OnDemand)
    {
        // A link without an onclick attribute yields no matches and is filtered out
        // (passing null to Regex.Matches would throw).
        var dataToLoad = (from link in actualLinks
                          let matches = Regex.Matches(link.GetAttribute("onclick") ?? string.Empty, @"\""([^\""]*)\""")
                          where matches.Count > 1
                          select new
                          {
                              LinkElement = link,
                              Url = $"http://{_diaryName}.diary.ru{matches[1].Groups[1].Value}?post={matches[0].Groups[1].Value}&js",
                              MorePartName = matches[0].Groups[1].Value
                          }).ToList();

        var resources = dataToLoad.Select(d => new DownloadResource { Url = d.Url });
        var downloadResults = await _dataDownloader.Download(resources);

        // Pair every request with its download result by URL.
        var results = (from d in dataToLoad
                       from r in downloadResults
                       where d.Url == r.Resource.Url
                       select new { d.LinkElement, d.Url, r.DownloadedData, d.MorePartName }).ToList();

        foreach (var r in results)
        {
            var match = Regex.Match(r.DownloadedData.AsAnsiString(), @"innerHTML\s*=\s*'([^']*)'");
            if (!match.Success)
            {
                continue;
            }

            var spanElement = doc.QuerySelector($"#more{r.MorePartName}");
            if (spanElement == null)
            {
                continue;
            }

            spanElement.InnerHtml = match.Groups[1].Value;
        }
    }
    else if (_moreType == DiaryMoreLinksType.FullPage)
    {
        // Only the page behind the first link is downloaded; the anchors of all links are
        // looked up in that page.
        var resource = new DownloadResource { Url = actualLinks[0].GetAttribute("href") };
        var downloadResult = await _dataDownloader.Download(resource, false, 1000);
        var docFull = await _parser.ParseAsync(downloadResult.DownloadedData.AsAnsiString());

        foreach (var link in actualLinks)
        {
            var match = Regex.Match(link.GetAttribute("href"), @"\/p(\d*).html?\?oam#(.*)$");
            if (!match.Success)
            {
                continue;
            }

            var postNum = match.Groups[1].Value;
            var moreName = match.Groups[2].Value;

            // The span id below is built from the anchor name without its first four characters;
            // shorter names cannot be mapped (Substring would throw), so they are skipped.
            if (moreName.Length < 4)
            {
                continue;
            }

            var elementStart = docFull.QuerySelector($"a[name='{moreName}']");
            var elementEnd = docFull.QuerySelector($"a[name='{moreName}end']");
            if (elementStart == null || elementEnd == null)
            {
                continue;
            }

            // Collect the siblings between the start and end anchor in a temporary div
            // to obtain their markup.
            var newDiv = docFull.CreateElement("div");
            elementStart.Before(newDiv);

            var nodesToCopy = new List<INode>();
            for (var currentNode = elementStart.NextSibling;
                 currentNode != null && currentNode != elementEnd;
                 currentNode = currentNode.NextSibling)
            {
                nodesToCopy.Add(currentNode);
            }

            foreach (var el in nodesToCopy)
            {
                newDiv.AppendChild(el);
            }

            var moreSpanId = "more" + postNum + "m" + moreName.Substring(4);
            var newMoreSpan = doc.CreateElement("span");
            newMoreSpan.Id = moreSpanId;
            newMoreSpan.Style.Display = "none";
            newMoreSpan.Style.Visibility = "hidden";

            link.After(newMoreSpan);
            link.Id = "link" + moreSpanId;
            newMoreSpan.InnerHtml = newDiv.InnerHtml;
        }
    }

    return true;
}
/// <summary>
/// Replaces every table by a responsive wrapper
/// (<c>div.canvasRteResponsiveTable &gt; div.tableLeftAlign.tableWrapper &gt; table &gt; tbody</c>)
/// that contains a simplified copy of the rows of the table's first body. Per row the header
/// cells are copied first, as <c>td</c> with the text in a <c>strong</c> element, followed by the
/// data cells as plain-text <c>td</c>. <c>rowspan</c>/<c>colspan</c> values other than "1" are
/// carried over. A table without any body yields an empty replacement table.
/// </summary>
/// <param name="tables">Tables to replace.</param>
/// <param name="document">Document used to create the new elements.</param>
protected virtual void TransformTables(IHtmlCollection<IElement> tables, IHtmlDocument document)
{
    // Copies a span attribute to the new cell unless it is missing, empty or the default "1".
    void CopySpan(IElement source, IElement target, string attributeName)
    {
        var span = source.GetAttribute(attributeName);
        if (!string.IsNullOrEmpty(span) && span != "1")
        {
            target.SetAttribute(attributeName, span);
        }
    }

    // TODO: what about nested tables?
    foreach (var table in tables)
    {
        // <div class="canvasRteResponsiveTable">
        var newTableElement = document.CreateElement("div");
        newTableElement.ClassName = "canvasRteResponsiveTable";

        // <div class="tableLeftAlign tableWrapper">
        // Possible alignments: tableLeftAlign, tableCenterAlign and tableRightAlign, since wiki does not have this option default to left align
        var innerDiv = document.CreateElement("div");
        innerDiv.ClassList.Add(new string[] { "tableLeftAlign", "tableWrapper" });
        newTableElement.AppendChild(innerDiv);

        // <table class="..." title="Table">
        var tableElement = document.CreateElement("table");

        // ms-rteTable-default (basic grid lines) and unknown classes map to the simple style;
        // ms-rteTable-<number> is mapped through TableStyleCodeToName.
        string tableClassName = "simpleTableStyleNeutral";
        if (!string.IsNullOrEmpty(table.ClassName)
            && !table.ClassName.Equals("ms-rteTable-default", StringComparison.InvariantCultureIgnoreCase)
            && int.TryParse(table.ClassName.ToLower().Replace("ms-rtetable-", ""), out int tableStyleCode))
        {
            tableClassName = TableStyleCodeToName(tableStyleCode);
        }
        tableElement.ClassName = tableClassName;
        tableElement.SetAttribute("title", "Table");
        innerDiv.AppendChild(tableElement);

        // <tbody>
        var tableBody = document.CreateElement("tbody");
        tableElement.AppendChild(tableBody);

        // Only the first body is copied. A table without a body (or an element that is not a
        // table at all) has no rows to copy; indexing Bodies[0] would throw for it.
        var sourceBody = (table as IHtmlTableElement)?.Bodies.FirstOrDefault();
        if (sourceBody != null)
        {
            var rows = sourceBody.Children
                .Where(p => p.TagName.Equals("tr", StringComparison.InvariantCultureIgnoreCase))
                .ToList();

            // TODO: col and row spans are not yet supported in RTE but do seem to work...verify
            foreach (var row in rows)
            {
                var newRow = document.CreateElement("tr");

                // Header cells become <td><strong>text</strong></td>
                foreach (var tableHeader in row.Children.Where(p => p.TagName.Equals("th", StringComparison.InvariantCultureIgnoreCase)))
                {
                    var tableHeaderValue = document.CreateElement("strong");
                    tableHeaderValue.TextContent = tableHeader.TextContent;

                    var tableHeaderCell = document.CreateElement("td");
                    tableHeaderCell.AppendChild(tableHeaderValue);

                    CopySpan(tableHeader, tableHeaderCell, "rowspan");
                    CopySpan(tableHeader, tableHeaderCell, "colspan");
                    newRow.AppendChild(tableHeaderCell);
                }

                // Data cells keep their text only
                foreach (var tableCell in row.Children.Where(p => p.TagName.Equals("td", StringComparison.InvariantCultureIgnoreCase)))
                {
                    var newTableCell = document.CreateElement("td");
                    newTableCell.TextContent = tableCell.TextContent;

                    CopySpan(tableCell, newTableCell, "rowspan");
                    CopySpan(tableCell, newTableCell, "colspan");
                    newRow.AppendChild(newTableCell);
                }

                tableBody.AppendChild(newRow);
            }
        }

        // Swap old table with new table
        table.Parent.ReplaceChild(newTableElement, table);
    }
}
/// <summary>
/// Converts a XAML value into HTML below <paramref name="outerElement"/>.
/// A <c>XamlObject</c> is mapped to an element according to the name of its element type
/// (panels and shapes become <c>div</c>, Grid becomes a <c>table</c> with one cell per
/// row/column definition, Button/TextBlock/TextBox become <c>button</c>/<c>span</c>/text
/// <c>input</c>); unknown types produce nothing. The element is sized, its properties are
/// parsed with <c>ParseProperty</c> and it is appended to <paramref name="outerElement"/>.
/// A <c>XamlTextValue</c> sets the text content of <paramref name="outerElement"/> instead.
/// </summary>
/// <param name="object">XAML value to convert.</param>
/// <param name="htmlDocument">Document used to create elements.</param>
/// <param name="outerElement">Element that receives the result.</param>
/// <returns>The element created for a XAML object; null for text values and unknown types.</returns>
private static IElement ParseObject(XamlPropertyValue @object, IHtmlDocument htmlDocument, IElement outerElement)
{
    IElement element = null;

    if (@object is XamlObject)
    {
        // Set for Grid, which appends itself and parses its properties and children on its own.
        bool handledCompletely = false;
        var xamlObject = (XamlObject)@object;

        switch (xamlObject.ElementType.Name)
        {
            case "Viewbox": // TODO: stretch, zoom??
            case "Border":
            case "Image":
            case "Rectangle":
                {
                    element = htmlDocument.CreateElement("div");
                    break;
                }
            case "Canvas":
                {
                    element = htmlDocument.CreateElement("div");
                    ((IHtmlElement)element).Style.Position = "absolute";
                    break;
                }
            case "StackPanel":
            case "DockPanel":
                {
                    element = htmlDocument.CreateElement("div");
                    ((IHtmlElement)element).Style.Display = "flex";
                    ((IHtmlElement)element).Style.FlexDirection = "column";
                    break;
                }
            case "WrapPanel":
                {
                    element = htmlDocument.CreateElement("div");
                    ((IHtmlElement)element).Style.Display = "flex";
                    ((IHtmlElement)element).Style.FlexWrap = "wrap";
                    ((IHtmlElement)element).Style.FlexDirection = "column";
                    break;
                }
            case "Grid":
                {
                    var tbl = htmlDocument.CreateElement("table");
                    ((IHtmlElement)tbl).Style.Width = "100%";
                    ((IHtmlElement)tbl).Style.Height = "100%";
                    outerElement.AppendChild(tbl);
                    handledCompletely = true;

                    var grid = xamlObject.Instance as Grid;
                    foreach (var xamlProperty in xamlObject.Properties.Where(x => x.PropertyName != "Children"))
                    {
                        ParseProperty(xamlProperty, htmlDocument, (IHtmlElement)tbl);
                    }

                    // A grid without a Children property simply has no children to place
                    // (the original dereferenced the lookup result unconditionally).
                    var children = xamlObject.Properties.FirstOrDefault(x => x.PropertyName == "Children");
                    XamlObject[] gridChildren = children == null
                        ? new XamlObject[0]
                        : children.CollectionElements.OfType<XamlObject>().ToArray();

                    // Without definitions the grid still has one implicit row / column.
                    int rowCount = grid.RowDefinitions.Count > 0 ? grid.RowDefinitions.Count : 1;
                    int columnCount = grid.ColumnDefinitions.Count > 0 ? grid.ColumnDefinitions.Count : 1;

                    for (int n = 0; n < rowCount; n++)
                    {
                        var row = htmlDocument.CreateElement("tr");
                        ((IHtmlElement)row).Style.VerticalAlign = "top";
                        tbl.AppendChild(row);
                        if (grid.RowDefinitions.Count > 0)
                        {
                            ((IHtmlElement)row).Style.Height = ParseGridLenth(grid.RowDefinitions[n].Height);
                        }
                        row.ClassList.Add("visuGrid");

                        for (int p = 0; p < columnCount; p++)
                        {
                            var td = htmlDocument.CreateElement("td");
                            td.ClassList.Add("visuGrid");
                            row.AppendChild(td);

                            // Every cell gets a div that hosts the children of that cell.
                            var cell = htmlDocument.CreateElement("div");
                            td.AppendChild(cell);
                            ((IHtmlElement)cell).Style.Width = "100%";
                            ((IHtmlElement)cell).Style.Height = "100%";
                            if (grid.ColumnDefinitions.Count > 0)
                            {
                                ((IHtmlElement)td).Style.Width = ParseGridLenth(grid.ColumnDefinitions[p].Width);
                            }

                            // Row Col Span should be used
                            foreach (var child in gridChildren)
                            {
                                var childInstance = (UIElement)child.Instance;
                                if (Grid.GetColumn(childInstance) == p && Grid.GetRow(childInstance) == n)
                                {
                                    ParseObject(child, htmlDocument, cell);
                                }
                            }
                        }
                    }

                    element = tbl;
                    break;
                }
            case "Button":
                {
                    element = htmlDocument.CreateElement("button");
                    break;
                }
            case "TextBlock":
                {
                    element = htmlDocument.CreateElement("span");
                    break;
                }
            case "TextBox":
                {
                    element = htmlDocument.CreateElement("input");
                    element.SetAttribute("type", "text");
                    break;
                }
            default:
                {
                    break;
                }
        }

        if (element != null)
        {
            bool parentIsGrid = xamlObject.ParentObject != null && xamlObject.ParentObject.Instance is Grid;
            if (parentIsGrid)
            {
                // Inside a grid cell a stretched element fills the cell, anything else keeps
                // its fixed size.
                var frameworkElement = (FrameworkElement)xamlObject.Instance;

                if (frameworkElement.HorizontalAlignment != HorizontalAlignment.Stretch)
                {
                    SetFixedWidth((IHtmlElement)element, xamlObject);
                }
                else
                {
                    ((IHtmlElement)element).Style.Width = "100%";
                }

                if (frameworkElement.VerticalAlignment != VerticalAlignment.Stretch)
                {
                    SetFixedHeight((IHtmlElement)element, xamlObject);
                }
                else
                {
                    ((IHtmlElement)element).Style.Height = "100%";
                }
            }
            else
            {
                SetFixedWidth((IHtmlElement)element, xamlObject);
                SetFixedHeight((IHtmlElement)element, xamlObject);
            }

            if (!handledCompletely)
            {
                foreach (var xamlProperty in xamlObject.Properties)
                {
                    ParseProperty(xamlProperty, htmlDocument, (IHtmlElement)element);
                }
                outerElement.AppendChild(element);
            }
        }
    }
    else if (@object is XamlTextValue)
    {
        var text = (XamlTextValue)@object;
        outerElement.TextContent = text.Text;
    }

    return element;
}
/// <summary>
/// Rewrites the classic rich-text styling of the given elements:
/// <list type="bullet">
/// <item><description><c>ms-rteStyle-Normal</c> and <c>ms-rteThemeBackColor-*</c> classes are removed;</description></item>
/// <item><description><c>ms-rteThemeForeColor-*</c>, <c>ms-rteForeColor-*</c>, <c>ms-rteBackColor-*</c> and
/// <c>ms-rteFontSize-*</c> classes are replaced by <c>fontColor*</c>, <c>highlightColor*</c> and
/// <c>fontSize*</c> classes when their code can be mapped, and removed otherwise;</description></item>
/// <item><description>elements styled as strike-through or underline are wrapped in <c>s</c> / <c>u</c>;</description></item>
/// <item><description>a span that was not wrapped and has no classes left is replaced by text; any other
/// element except <c>strong</c> is wrapped in a span that takes over its classes.</description></item>
/// </list>
/// </summary>
/// <param name="elementsToTransform">Elements to rewrite.</param>
/// <param name="document">Document used to create wrapper elements.</param>
protected virtual void TransformElements(IHtmlCollection<IElement> elementsToTransform, IHtmlDocument document)
{
    // Removes matchedClass from the element. When codeText is a number that codeToName maps to a
    // non-empty name, the class newClassPrefix + name is added instead.
    void ReplaceCodedClass(IElement target, string matchedClass, string codeText, Func<int, string> codeToName, string newClassPrefix)
    {
        string newClass = null;
        if (int.TryParse(codeText, out int code))
        {
            string mappedName = codeToName(code);
            if (!string.IsNullOrEmpty(mappedName))
            {
                newClass = newClassPrefix + mappedName;
            }
        }

        target.ClassList.Remove(matchedClass);
        if (!string.IsNullOrEmpty(newClass))
        {
            target.ClassList.Add(newClass);
        }
    }

    // Replaces target (a child of parentNode) by a new tagName element holding target's markup.
    void WrapInTag(INode parentNode, IElement target, string tagName)
    {
        var wrapper = document.CreateElement(tagName);
        wrapper.InnerHtml = target.OuterHtml;
        parentNode.ReplaceChild(wrapper, target);
    }

    foreach (var element in elementsToTransform)
    {
        var parent = element.Parent;

        // rewrite normal style
        // <span class="ms-rteStyle-Normal">Norm</span>
        var rtestylenormal = element.ClassList.PartialMatch("ms-rtestyle-normal");
        if (!string.IsNullOrEmpty(rtestylenormal))
        {
            element.ClassList.Remove(rtestylenormal);
        }

        // ================================
        // rewrite colors, back and fore color + size can be defined as class on a single span element
        // ================================
        // <span class="ms-rteThemeForeColor-5-0">red</span>
        var themeForeColor = element.ClassList.PartialMatch("ms-rtethemeforecolor-");
        if (!string.IsNullOrEmpty(themeForeColor))
        {
            // Modern Theme colors
            // Darker, Dark, Dark Alternate, Primary, Secondary
            // Neutral Tertiary, Neutral Secondary, Primary alternate, Neutral primary, Neutral Dark
            // The theme code is taken from the last character of the class name.
            string lowered = themeForeColor.ToLower();
            string themeCodeText = lowered[lowered.Length - 1].ToString();
            ReplaceCodedClass(element, themeForeColor, themeCodeText, code => ThemeCodeToForegroundColorName(code), "fontColor");
        }

        // <span class="ms-rteThemeBackColor-5-0">red</span>
        var rtethemebackcolor = element.ClassList.PartialMatch("ms-rtethemebackcolor-");
        if (!string.IsNullOrEmpty(rtethemebackcolor))
        {
            // There are no themed back colors in modern, so for now drop the background color class
            element.ClassList.Remove(rtethemebackcolor);
        }

        //<span class="ms-rteForeColor-2" style="">Red, </span>
        //<sup class="ms-rteForeColor-10" style=""><strong style="">superscript</strong></sup>
        var rteforecolor = element.ClassList.PartialMatch("ms-rteforecolor-");
        if (!string.IsNullOrEmpty(rteforecolor))
        {
            // Modern colors
            // Dark Red, Red, Orange, Yellow, Light green
            // Green, Light Blue, Blue, Dark Blue, Purple
            ReplaceCodedClass(element, rteforecolor, rteforecolor.ToLower().Replace("ms-rteforecolor-", ""), code => ColorCodeToForegroundColorName(code), "fontColor");
        }

        // <sub class="ms-rteBackColor-2">lowerscript</sub>
        var rtebackcolor = element.ClassList.PartialMatch("ms-rtebackcolor-");
        if (!string.IsNullOrEmpty(rtebackcolor))
        {
            // Same color palette as the foreground colors, mapped to highlight classes
            ReplaceCodedClass(element, rtebackcolor, rtebackcolor.ToLower().Replace("ms-rtebackcolor-", ""), code => ColorCodeToBackgroundColorName(code), "highlightColor");
        }

        // ================================
        // rewrite font size
        // ================================
        var rtefontsize = element.ClassList.PartialMatch("ms-rtefontsize-");
        if (!string.IsNullOrEmpty(rtefontsize))
        {
            ReplaceCodedClass(element, rtefontsize, rtefontsize.ToLower().Replace("ms-rtefontsize-", ""), code => FontCodeToName(code), "fontSize");
        }

        // rewrite striked and underline
        // <span style="text-decoration:line-through;">striked</span>
        // <span style="text-decoration:underline;">underline</span>
        bool replacementDone = false;
        if (IsStrikeThrough(element))
        {
            WrapInTag(parent, element, "s");
            replacementDone = true;
        }
        else if (IsUnderline(element))
        {
            WrapInTag(parent, element, "u");
            replacementDone = true;
        }

        // No need to wrap a span into a new span
        if (element is IHtmlSpanElement)
        {
            // if we still did not replace the span element and the span has no classes set anymore then we can replace it by text
            if (!replacementDone && element.ClassList.Length == 0)
            {
                ReplaceChildElementByText(parent, element, document);
            }
        }
        else if (element.TagName.Equals("strong", StringComparison.InvariantCultureIgnoreCase))
        {
            // do nothing special here
        }
        else if (!replacementDone)
        {
            // Non span element with styling that was transformed will be wrapped in a span containing the styling which wraps a "clean" element.
            // This is skipped when the element was already swapped for an <s>/<u> wrapper above:
            // it is no longer a child of parent, so replacing it a second time would fail.
            var newElement = document.CreateElement("span");
            newElement.ClassList.Add(element.ClassList.ToArray());
            element.ClassList.Remove(element.ClassList.ToArray());
            newElement.InnerHtml = element.OuterHtml;
            parent.ReplaceChild(newElement, element);
        }
    }
}
/// <summary>
/// Creates a new element with the given tag name by delegating to <c>document.CreateElement</c>.
/// </summary>
/// <param name="name">Tag name handed to <c>CreateElement</c>.</param>
/// <returns>The element produced by <c>document.CreateElement</c>.</returns>
public static IElement Create(string name) => document.CreateElement(name);
/// <summary>
/// Finds every &lt;noscript&gt; element that contains a single &lt;img&gt; and directly follows
/// an element that is (or only contains) a single image, and replaces that preceding element
/// with the image from inside the &lt;noscript&gt;. Image-related attributes of the replaced
/// image are copied onto the new one. Before that, &lt;img&gt; elements that have no source
/// attribute and no attribute value that looks like an image file are removed.
/// The &lt;noscript&gt; element itself is left in place by this method.
/// </summary>
/// <param name="doc">The document to operate on</param>
internal static void UnwrapNoscriptImages(IHtmlDocument doc)
{
    // Attribute values that look like an image file name. Matched case-insensitively so that
    // upper-case extensions such as ".JPG" are recognised as well.
    const string imageFilePattern = @"\.(jpg|jpeg|png|webp)";

    // Remove img elements without a source or any attribute that might contain an image, so
    // that a placeholder img is not the one replaced by the noscript image in the next step.
    // NOTE(review): imgs is passed to ForEachNode as returned by GetElementsByTagName while
    // the callback removes nodes - confirm ForEachNode tolerates removal during iteration.
    var imgs = doc.GetElementsByTagName("img");
    ForEachNode(imgs, (img) =>
    {
        if (!(img is IElement imgElement))
        {
            return;
        }

        for (var i = 0; i < imgElement.Attributes.Length; i++)
        {
            var attr = imgElement.Attributes[i];
            switch (attr.Name)
            {
                case "src":
                case "srcset":
                case "data-src":
                case "data-srcset":
                    // Has a source attribute: keep the image.
                    return;
            }

            if (Regex.IsMatch(attr.Value, imageFilePattern, RegexOptions.IgnoreCase))
            {
                // Some other attribute points at an image file: keep the image.
                return;
            }
        }

        img.Parent.RemoveChild(img);
    });

    // Next find noscript elements and try to extract their image.
    var noscripts = doc.GetElementsByTagName("noscript");
    ForEachNode(noscripts, (noscript) =>
    {
        if (!(noscript is IElement noscriptElement))
        {
            return;
        }

        // Parse the noscript content into a scratch element and make sure it only contains an image.
        var tmp = doc.CreateElement("div");
        tmp.InnerHtml = noscriptElement.InnerHtml;
        if (!IsSingleImage(tmp))
        {
            return;
        }

        // Only act when the previous sibling element is itself a single image.
        var prevElement = noscriptElement.PreviousElementSibling;
        if (prevElement == null || !IsSingleImage(prevElement))
        {
            return;
        }

        var prevImg = prevElement;
        if (prevImg.TagName != "IMG")
        {
            prevImg = prevElement.GetElementsByTagName("img")[0];
        }

        // Carry over the old image's non-empty image-related attributes to the new image.
        var newImg = tmp.GetElementsByTagName("img")[0];
        for (var i = 0; i < prevImg.Attributes.Length; i++)
        {
            var attr = prevImg.Attributes[i];
            if (attr.Value == "")
            {
                continue;
            }

            bool isImageAttribute = attr.Name == "src"
                || attr.Name == "srcset"
                || Regex.IsMatch(attr.Value, imageFilePattern, RegexOptions.IgnoreCase);
            if (!isImageAttribute)
            {
                continue;
            }

            if (newImg.GetAttribute(attr.Name) == attr.Value)
            {
                // Same value already present on the new image.
                continue;
            }

            // Do not overwrite an attribute of the new image: store the old value under "data-old-<name>".
            var attrName = attr.Name;
            if (newImg.HasAttribute(attrName))
            {
                attrName = "data-old-" + attrName;
            }

            newImg.SetAttribute(attrName, attr.Value);
        }

        noscript.Parent.ReplaceChild(tmp.FirstElementChild, prevElement);
    });
}
/// <summary>
/// Rewrites classic rich-text styling on the given spans. A span is either replaced by its
/// text, replaced by an <c>s</c> / <c>u</c> element, or gets its class attribute swapped for
/// a mapped class name; spans matching none of the known patterns are left untouched.
/// </summary>
/// <param name="spans">Spans to rewrite.</param>
/// <param name="document">Document used to create the replacement nodes.</param>
protected virtual void TransformSpans(IHtmlCollection<IElement> spans, IHtmlDocument document)
{
    // Culture-independent, case-insensitive prefix test on a possibly missing class attribute.
    bool HasClassPrefix(string className, string prefix)
    {
        return className != null && className.StartsWith(prefix, StringComparison.OrdinalIgnoreCase);
    }

    // Strips the known prefix, parses what remains as an integer code and maps it to
    // "<newClassPrefix><name>". Returns null when the code does not parse or has no name.
    string MapCodedClass(string className, string prefix, string newClassPrefix, Func<int, string> codeToName)
    {
        if (int.TryParse(className.ToLowerInvariant().Replace(prefix, ""), out int code))
        {
            string name = codeToName(code);
            if (!string.IsNullOrEmpty(name))
            {
                return newClassPrefix + name;
            }
        }

        return null;
    }

    foreach (var span in spans)
    {
        var parent = span.Parent;
        string className = span.ClassName;

        // Normal style carries no information: keep only the text.
        // <span class="ms-rteStyle-Normal">Norm</span>
        if (className != null && className.IndexOf("ms-rtestyle-normal", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            ReplaceChildElementByText(parent, span, document);
            continue;
        }

        // Strike-through / underline become <s> / <u>; strike-through is checked first.
        // <span style="text-decoration:line-through;">striked</span>
        // <span style="text-decoration:underline;">underline</span>
        // NOTE(review): the test runs on OuterHtml, so styling on a nested element matches too.
        string outerHtml = span.OuterHtml;
        string decorationTag = null;
        if (outerHtml.IndexOf("text-decoration:line-through;", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            decorationTag = "s";
        }
        else if (outerHtml.IndexOf("text-decoration:underline;", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            decorationTag = "u";
        }

        if (decorationTag != null)
        {
            var newElement = document.CreateElement(decorationTag);
            // InnerHtml (not TextContent) so nested markup and entities stay markup
            // instead of being escaped into literal text.
            newElement.InnerHtml = span.InnerHtml;
            parent.ReplaceChild(newElement, span);
            continue;
        }

        // ================================
        // Class based styling: work out the replacement class (null = nothing to map to)
        // ================================
        string newClass = null;
        if (HasClassPrefix(className, "ms-rtethemeforecolor-"))
        {
            // Only the last character of the class is used as the theme code.
            // <span class="ms-rteThemeForeColor-5-0">red</span>
            if (int.TryParse(className[className.Length - 1].ToString(), out int themeCode))
            {
                string colorName = ThemeCodeToForegroundColorName(themeCode);
                if (!string.IsNullOrEmpty(colorName))
                {
                    newClass = $"fontColor{colorName}";
                }
            }
        }
        else if (HasClassPrefix(className, "ms-rtethemebackcolor-"))
        {
            // Theme back color has no mapping here: newClass stays null and the span is dropped.
            // <span class="ms-rteThemeBackColor-5-0">red</span>
        }
        else if (HasClassPrefix(className, "ms-rteforecolor-"))
        {
            // <span class="ms-rteForeColor-2" style="">Red, </span>
            newClass = MapCodedClass(className, "ms-rteforecolor-", "fontColor", code => ColorCodeToForegroundColorName(code));
        }
        else if (HasClassPrefix(className, "ms-rtebackcolor-"))
        {
            newClass = MapCodedClass(className, "ms-rtebackcolor-", "highlightColor", code => ColorCodeToBackgroundColorName(code));
        }
        else if (HasClassPrefix(className, "ms-rtefontsize-"))
        {
            newClass = MapCodedClass(className, "ms-rtefontsize-", "fontSize", code => FontCodeToName(code));
        }
        else
        {
            // Not a span this method rewrites.
            continue;
        }

        if (!string.IsNullOrEmpty(newClass))
        {
            // A mapping exists: the whole class attribute is replaced by the new class.
            span.ClassName = newClass;
        }
        else
        {
            // No mapping: drop the styling and keep only the text.
            ReplaceChildElementByText(parent, span, document);
        }
    }
}
/// <summary>
/// Rewrites the URIs found in the given element and its descendants through
/// <c>uri.ToAbsoluteURI</c>: the <c>href</c> of &lt;a&gt; elements and the <c>src</c>,
/// <c>poster</c> and <c>srcset</c> attributes of media elements. Links whose href starts
/// with "javascript:" are replaced by their content instead.
/// </summary>
/// <param name="articleContent">The node in which to fix all relative uri</param>
/// <param name="uri">The base uri</param>
/// <param name="doc">The document to operate on</param>
internal static void FixRelativeUris(IElement articleContent, Uri uri, IHtmlDocument doc)
{
    var links = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "a" });
    NodeUtility.ForEachNode(links, (link) =>
    {
        if (!(link is IElement anchor))
        {
            return;
        }

        var href = anchor.GetAttribute("href");
        if (string.IsNullOrWhiteSpace(href))
        {
            return;
        }

        // Ordinal comparison: the scheme prefix is a fixed token, not linguistic text.
        if (!href.StartsWith("javascript:", StringComparison.Ordinal))
        {
            anchor.SetAttribute("href", uri.ToAbsoluteURI(href));
            return;
        }

        // Remove links with javascript: URIs, since they won't work after scripts
        // have been removed from the page.
        if (link.ChildNodes.Length == 1 && link.ChildNodes[0].NodeType == NodeType.Text)
        {
            // The link only contains simple text content: convert it to a text node.
            var text = doc.CreateTextNode(link.TextContent);
            link.Parent.ReplaceChild(text, link);
        }
        else
        {
            // Otherwise move every child into a span so that all of them are preserved.
            var container = doc.CreateElement("span");
            while (link.ChildNodes.Length > 0)
            {
                container.AppendChild(link.ChildNodes[0]);
            }

            link.Parent.ReplaceChild(container, link);
        }
    });

    var medias = NodeUtility.GetAllNodesWithTag(articleContent, new string[] { "img", "picture", "figure", "video", "audio", "source" });
    NodeUtility.ForEachNode(medias, (media_node) =>
    {
        if (!(media_node is IElement media))
        {
            return;
        }

        // Read all three attributes up front, then rewrite the ones that are present.
        var src = media.GetAttribute("src");
        var poster = media.GetAttribute("poster");
        var srcset = media.GetAttribute("srcset");

        if (src != null)
        {
            media.SetAttribute("src", uri.ToAbsoluteURI(src));
        }

        if (poster != null)
        {
            media.SetAttribute("poster", uri.ToAbsoluteURI(poster));
        }

        if (srcset != null)
        {
            // For each match only group 1 is passed through ToAbsoluteURI; groups 2 and 3 are
            // appended unchanged (an unmatched group contributes an empty string).
            var newSrcset = RE_SrcSetUrl.Replace(srcset, (input) =>
                uri.ToAbsoluteURI(input.Groups[1].Value) + input.Groups[2].Value + input.Groups[3].Value);
            media.SetAttribute("srcset", newSrcset);
        }
    });
}