/// <summary>
/// Parses the supplied HTML, strips markup that can execute script, and returns the cleaned HTML.
/// </summary>
/// <remarks>
/// The passes run in this order:
/// <list type="number">
/// <item><description>Links whose href contains youtube.com, youtu.be, vimeo.com, screenr.com or instagram.com
/// are replaced by their own child nodes.</description></item>
/// <item><description>script, link, iframe, frameset, frame, applet, object and embed elements are removed.</description></item>
/// <item><description>href on anchors and src on images is set to "#" when it is a javascript/jscript/vbscript URL.</description></item>
/// <item><description>Every attribute whose name starts with "on" is removed.</description></item>
/// <item><description>style attributes containing the word "expression" are removed.</description></item>
/// <item><description>Closing tags the parser reported as never opened are re-paired around the preceding text node,
/// unless the element is one of the removed elements above.</description></item>
/// </list>
/// This is a best-effort filter; pass <paramref name="useXssSantiser"/> as true to also run the result through the sanitiser.
/// </remarks>
/// <param name="html">The HTML to clean. Null or empty input is returned unchanged.</param>
/// <param name="useXssSantiser">
/// When true, the scrubbed HTML is additionally passed through <c>Sanitizer.GetSafeHtmlFragment</c> and
/// <c>SanitizerCompatibleWithForiegnCharacters</c>. It is optional because, per the original author, the
/// sanitiser causes problems with line breaks when used with the MarkDown editor.
/// </param>
/// <returns>The scrubbed HTML.</returns>
public static string ScrubHtml(string html, bool useXssSantiser = false)
{
    if (string.IsNullOrEmpty(html))
    {
        return html;
    }

    // Elements that are never allowed in the output. The same list drives the removal pass and
    // the check that stops a stray closing tag from re-creating one of them further down.
    var blockedElements = new[] { "script", "link", "iframe", "frameset", "frame", "applet", "object", "embed" };

    // URL scheme prefixes that are treated as script.
    var scriptSchemes = new[] { "javascript", "jscript", "vbscript" };

    // Decides whether an attribute value is a script URL. Character references are decoded and all
    // whitespace/control characters are dropped first, because a plain prefix test on the raw value
    // is defeated by " javascript:", "java&#9;script:" or "&#106;avascript:".
    System.Func<string, bool> isScriptUrl = rawValue =>
    {
        if (string.IsNullOrEmpty(rawValue))
        {
            return false;
        }

        var decoded = System.Net.WebUtility.HtmlDecode(rawValue) ?? string.Empty;
        var compact = new string(decoded.Where(ch => !char.IsWhiteSpace(ch) && !char.IsControl(ch)).ToArray());
        return scriptSchemes.Any(scheme => compact.StartsWith(scheme, System.StringComparison.OrdinalIgnoreCase));
    };

    // clear the flags on P so unclosed elements in P will be auto closed.
    // NOTE(review): ElementsFlags is accessed statically, so this presumably affects every
    // document parsed afterwards, not just this one - confirm that is intended.
    HtmlNode.ElementsFlags.Remove("p");

    var doc = new HtmlDocument();
    doc.LoadHtml(html);

    var finishedHtml = html;

    if (doc.DocumentNode != null)
    {
        // Unwrap links to embeddable hosts: the <a> element is replaced by its own children.
        // (Presumably so a later step can turn the remaining URL text into an embed - not visible here.)
        var embedLinks = doc.DocumentNode.SelectNodes("//a[contains(@href, 'youtube.com')]|//a[contains(@href, 'youtu.be')]|//a[contains(@href, 'vimeo.com')]|//a[contains(@href, 'screenr.com')]|//a[contains(@href, 'instagram.com')]");
        if (embedLinks != null)
        {
            foreach (var embedLink in embedLinks)
            {
                if (embedLink.PreviousSibling == null)
                {
                    // Prepend children to parent node in reverse order so they end up in document order
                    foreach (var child in embedLink.ChildNodes.Reverse())
                    {
                        embedLink.ParentNode.PrependChild(child);
                    }
                }
                else
                {
                    // Insert children after previous sibling
                    foreach (var child in embedLink.ChildNodes)
                    {
                        embedLink.ParentNode.InsertAfter(child, embedLink.PreviousSibling);
                    }
                }

                // remove the now-unwrapped link from the tree
                embedLink.Remove();
            }
        }

        // Remove potentially harmful elements (together with their content).
        var blockedXPath = string.Join("|", blockedElements.Select(name => "//" + name).ToArray());
        var blockedNodes = doc.DocumentNode.SelectNodes(blockedXPath);
        if (blockedNodes != null)
        {
            foreach (var blockedNode in blockedNodes)
            {
                blockedNode.ParentNode.RemoveChild(blockedNode, false);
            }
        }

        // Neutralise hrefs pointing at java/j/vbscript URLs.
        var anchors = doc.DocumentNode.SelectNodes("//a[@href]");
        if (anchors != null)
        {
            foreach (var anchor in anchors)
            {
                if (isScriptUrl(anchor.GetAttributeValue("href", string.Empty)))
                {
                    anchor.SetAttributeValue("href", "#");
                }
            }
        }

        // Neutralise img sources pointing at java/j/vbscript URLs.
        var images = doc.DocumentNode.SelectNodes("//img[@src]");
        if (images != null)
        {
            foreach (var image in images)
            {
                if (isScriptUrl(image.GetAttributeValue("src", string.Empty)))
                {
                    image.SetAttributeValue("src", "#");
                }
            }
        }

        // Remove on<Event> handlers from all tags. Every attribute starting with "on" is dropped
        // rather than a fixed list, so handlers such as onmouseenter or onkeydown cannot slip through.
        var attributedElements = doc.DocumentNode.SelectNodes("//*[@*]");
        if (attributedElements != null)
        {
            foreach (var element in attributedElements)
            {
                // Snapshot first: the attribute collection must not be modified while it is enumerated.
                var handlers = element.Attributes
                    .Where(attribute => attribute.Name.StartsWith("on", System.StringComparison.OrdinalIgnoreCase))
                    .ToList();

                foreach (var handler in handlers)
                {
                    element.Attributes.Remove(handler);
                }
            }
        }

        // remove any style attributes that contain the word expression (IE evaluates this as script)
        var styledNodes = doc.DocumentNode.SelectNodes("//*[contains(translate(@style, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'expression')]");
        if (styledNodes != null)
        {
            foreach (var styledNode in styledNodes)
            {
                styledNode.Attributes.Remove("style");
            }
        }

        // build a list of nodes ordered by stream position
        var pos = new NodePositions(doc);

        // browse all tags detected as not opened
        foreach (var error in doc.ParseErrors.Where(e => e.Code == HtmlParseErrorCode.TagNotOpened))
        {
            // Only a bare closing tag ("</name>") is repaired. The element name is extracted and the
            // pair is rebuilt from the name alone, so nothing else from the raw source text (for
            // example attributes on the closing tag) is copied into the already-scrubbed output.
            var source = (error.SourceText ?? string.Empty).Trim();
            if (source.Length < 3
                || !source.StartsWith("</", System.StringComparison.Ordinal)
                || !source.EndsWith(">", System.StringComparison.Ordinal))
            {
                continue;
            }

            var tagName = source.Substring(2, source.Length - 3).Trim();
            var isPlainName = tagName.Length > 0
                && tagName.All(nameChar => char.IsLetterOrDigit(nameChar) || nameChar == '-' || nameChar == '_' || nameChar == ':' || nameChar == '.');

            // Never re-create an element the removal pass above would have deleted
            // (a stray "</script>" must not become "<script>...</script>").
            if (!isPlainName || blockedElements.Contains(tagName, System.StringComparer.OrdinalIgnoreCase))
            {
                continue;
            }

            // find the text node just before this error
            var last = pos.Nodes.OfType<HtmlTextNode>().LastOrDefault(n => n.StreamPosition < error.StreamPosition);
            if (last != null)
            {
                // fix the text; reintroduce the broken tag as a matched pair
                last.Text = "<" + tagName + ">" + last.Text + "</" + tagName + ">";
            }
        }

        finishedHtml = doc.DocumentNode.WriteTo();
    }

    // The reason we have this option, is using the sanitiser with the MarkDown editor
    // causes problems with line breaks.
    if (useXssSantiser)
    {
        return SanitizerCompatibleWithForiegnCharacters(Sanitizer.GetSafeHtmlFragment(finishedHtml));
    }

    return finishedHtml;
}
/// <summary>
/// Parses the supplied HTML, strips markup that can execute script, and returns the cleaned HTML.
/// </summary>
/// <remarks>
/// The passes run in this order:
/// <list type="number">
/// <item><description>Links whose href contains youtube.com, youtu.be, vimeo.com, screenr.com or instagram.com
/// are replaced by their own child nodes.</description></item>
/// <item><description>script, link, iframe, frameset, frame, applet, object and embed elements are removed.</description></item>
/// <item><description>href on anchors and src on images is set to "#" when it is a javascript/jscript/vbscript URL.</description></item>
/// <item><description>Every attribute whose name starts with "on" is removed.</description></item>
/// <item><description>style attributes containing the word "expression" are removed.</description></item>
/// <item><description>Closing tags the parser reported as never opened are re-paired around the preceding text node,
/// unless the element is one of the removed elements above.</description></item>
/// </list>
/// This is a best-effort filter; pass <paramref name="useXssSantiser"/> as true to also run the result through the sanitiser.
/// </remarks>
/// <param name="html">The HTML to clean. Null or empty input is returned unchanged.</param>
/// <param name="useXssSantiser">
/// When true, the scrubbed HTML is additionally passed through <c>Sanitizer.GetSafeHtmlFragment</c> and
/// <c>SanitizerCompatibleWithForiegnCharacters</c>. It is optional because, per the original author, the
/// sanitiser causes problems with line breaks when used with the MarkDown editor.
/// </param>
/// <returns>The scrubbed HTML.</returns>
public static string ScrubHtml(string html, bool useXssSantiser = false)
{
    if (string.IsNullOrEmpty(html))
    {
        return html;
    }

    // Elements that are never allowed in the output. The same list drives the removal pass and
    // the check that stops a stray closing tag from re-creating one of them further down.
    var blockedElements = new[] { "script", "link", "iframe", "frameset", "frame", "applet", "object", "embed" };

    // URL scheme prefixes that are treated as script.
    var scriptSchemes = new[] { "javascript", "jscript", "vbscript" };

    // Decides whether an attribute value is a script URL. Character references are decoded and all
    // whitespace/control characters are dropped first, because a plain prefix test on the raw value
    // is defeated by " javascript:", "java&#9;script:" or "&#106;avascript:".
    System.Func<string, bool> isScriptUrl = rawValue =>
    {
        if (string.IsNullOrEmpty(rawValue))
        {
            return false;
        }

        var decoded = System.Net.WebUtility.HtmlDecode(rawValue) ?? string.Empty;
        var compact = new string(decoded.Where(ch => !char.IsWhiteSpace(ch) && !char.IsControl(ch)).ToArray());
        return scriptSchemes.Any(scheme => compact.StartsWith(scheme, System.StringComparison.OrdinalIgnoreCase));
    };

    // clear the flags on P so unclosed elements in P will be auto closed.
    // NOTE(review): ElementsFlags is accessed statically, so this presumably affects every
    // document parsed afterwards, not just this one - confirm that is intended.
    HtmlNode.ElementsFlags.Remove("p");

    var doc = new HtmlDocument();
    doc.LoadHtml(html);

    var finishedHtml = html;

    if (doc.DocumentNode != null)
    {
        // Unwrap links to embeddable hosts: the <a> element is replaced by its own children.
        // (Presumably so a later step can turn the remaining URL text into an embed - not visible here.)
        var embedLinks = doc.DocumentNode.SelectNodes("//a[contains(@href, 'youtube.com')]|//a[contains(@href, 'youtu.be')]|//a[contains(@href, 'vimeo.com')]|//a[contains(@href, 'screenr.com')]|//a[contains(@href, 'instagram.com')]");
        if (embedLinks != null)
        {
            foreach (var embedLink in embedLinks)
            {
                if (embedLink.PreviousSibling == null)
                {
                    // Prepend children to parent node in reverse order so they end up in document order
                    foreach (var child in embedLink.ChildNodes.Reverse())
                    {
                        embedLink.ParentNode.PrependChild(child);
                    }
                }
                else
                {
                    // Insert children after previous sibling
                    foreach (var child in embedLink.ChildNodes)
                    {
                        embedLink.ParentNode.InsertAfter(child, embedLink.PreviousSibling);
                    }
                }

                // remove the now-unwrapped link from the tree
                embedLink.Remove();
            }
        }

        // Remove potentially harmful elements (together with their content).
        var blockedXPath = string.Join("|", blockedElements.Select(name => "//" + name).ToArray());
        var blockedNodes = doc.DocumentNode.SelectNodes(blockedXPath);
        if (blockedNodes != null)
        {
            foreach (var blockedNode in blockedNodes)
            {
                blockedNode.ParentNode.RemoveChild(blockedNode, false);
            }
        }

        // Neutralise hrefs pointing at java/j/vbscript URLs.
        var anchors = doc.DocumentNode.SelectNodes("//a[@href]");
        if (anchors != null)
        {
            foreach (var anchor in anchors)
            {
                if (isScriptUrl(anchor.GetAttributeValue("href", string.Empty)))
                {
                    anchor.SetAttributeValue("href", "#");
                }
            }
        }

        // Neutralise img sources pointing at java/j/vbscript URLs.
        var images = doc.DocumentNode.SelectNodes("//img[@src]");
        if (images != null)
        {
            foreach (var image in images)
            {
                if (isScriptUrl(image.GetAttributeValue("src", string.Empty)))
                {
                    image.SetAttributeValue("src", "#");
                }
            }
        }

        // Remove on<Event> handlers from all tags. Every attribute starting with "on" is dropped
        // rather than a fixed list, so handlers such as onmouseenter or onkeydown cannot slip through.
        var attributedElements = doc.DocumentNode.SelectNodes("//*[@*]");
        if (attributedElements != null)
        {
            foreach (var element in attributedElements)
            {
                // Snapshot first: the attribute collection must not be modified while it is enumerated.
                var handlers = element.Attributes
                    .Where(attribute => attribute.Name.StartsWith("on", System.StringComparison.OrdinalIgnoreCase))
                    .ToList();

                foreach (var handler in handlers)
                {
                    element.Attributes.Remove(handler);
                }
            }
        }

        // remove any style attributes that contain the word expression (IE evaluates this as script)
        var styledNodes = doc.DocumentNode.SelectNodes("//*[contains(translate(@style, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'expression')]");
        if (styledNodes != null)
        {
            foreach (var styledNode in styledNodes)
            {
                styledNode.Attributes.Remove("style");
            }
        }

        // build a list of nodes ordered by stream position
        var pos = new NodePositions(doc);

        // browse all tags detected as not opened
        foreach (var error in doc.ParseErrors.Where(e => e.Code == HtmlParseErrorCode.TagNotOpened))
        {
            // Only a bare closing tag ("</name>") is repaired. The element name is extracted and the
            // pair is rebuilt from the name alone, so nothing else from the raw source text (for
            // example attributes on the closing tag) is copied into the already-scrubbed output.
            var source = (error.SourceText ?? string.Empty).Trim();
            if (source.Length < 3
                || !source.StartsWith("</", System.StringComparison.Ordinal)
                || !source.EndsWith(">", System.StringComparison.Ordinal))
            {
                continue;
            }

            var tagName = source.Substring(2, source.Length - 3).Trim();
            var isPlainName = tagName.Length > 0
                && tagName.All(nameChar => char.IsLetterOrDigit(nameChar) || nameChar == '-' || nameChar == '_' || nameChar == ':' || nameChar == '.');

            // Never re-create an element the removal pass above would have deleted
            // (a stray "</script>" must not become "<script>...</script>").
            if (!isPlainName || blockedElements.Contains(tagName, System.StringComparer.OrdinalIgnoreCase))
            {
                continue;
            }

            // find the text node just before this error
            var last = pos.Nodes.OfType<HtmlTextNode>().LastOrDefault(n => n.StreamPosition < error.StreamPosition);
            if (last != null)
            {
                // fix the text; reintroduce the broken tag as a matched pair
                last.Text = "<" + tagName + ">" + last.Text + "</" + tagName + ">";
            }
        }

        finishedHtml = doc.DocumentNode.WriteTo();
    }

    // The reason we have this option, is using the sanitiser with the MarkDown editor
    // causes problems with line breaks.
    if (useXssSantiser)
    {
        return SanitizerCompatibleWithForiegnCharacters(Sanitizer.GetSafeHtmlFragment(finishedHtml));
    }

    return finishedHtml;
}