/// <summary>
/// Runs the post-processing stage: raises the <see cref="E:PostProcessNode" /> event for every node
/// found below <paramref name="context"/>, then raises the <see cref="E:PostProcessDom" /> event once
/// for the document. Each stage is skipped when its event has no subscribers.
/// </summary>
/// <param name="dom">The HTML document.</param>
/// <param name="context">The node within which to post process all nodes.</param>
private void DoPostProcess(IHtmlDocument dom, INode context)
{
    if (PostProcessNode != null)
    {
        dom.Normalize();

        // Materialize the node sequence up front: nodes may be replaced while we walk the list.
        var snapshot = GetAllNodes(context).ToList();

        foreach (var current in snapshot)
        {
            var nodeArgs = new PostProcessNodeEventArgs
            {
                Document = dom,
                Node = current
            };

            OnPostProcessNode(nodeArgs);

            // A handler asks for a replacement by populating ReplacementNodes.
            if (nodeArgs.ReplacementNodes.Any())
            {
                ((IChildNode)current).Replace(nodeArgs.ReplacementNodes.ToArray());
            }
        }
    }

    if (PostProcessDom != null)
    {
        var domArgs = new PostProcessDomEventArgs
        {
            Document = dom
        };

        OnPostProcessDom(domArgs);
    }
}
/// <summary>
/// Raises the <see cref="E:PostProcessNode" /> event.
/// </summary>
/// <param name="e">The <see cref="PostProcessNodeEventArgs"/> instance containing the event data.</param>
protected virtual void OnPostProcessNode(PostProcessNodeEventArgs e)
{
    // The null-conditional invoke reads the event field only once, so a handler that
    // unsubscribes between the null check and the call cannot cause a NullReferenceException.
    PostProcessNode?.Invoke(this, e);
}
/// <summary>
/// Raises the <see cref="E:PostProcessNode" /> event for each of the supplied nodes and applies any
/// replacement nodes a handler provided. Does nothing when the event has no subscribers.
/// </summary>
/// <param name="dom">The HTML document.</param>
/// <param name="nodes">The list of nodes in the document.</param>
private void DoPostProcess(IHtmlDocument dom, List<INode> nodes)
{
    // Nothing to do without subscribers.
    if (PostProcessNode == null)
    {
        return;
    }

    foreach (var current in nodes)
    {
        var args = new PostProcessNodeEventArgs
        {
            Document = dom,
            Node = current
        };

        OnPostProcessNode(args);

        // A handler asks for a replacement by populating ReplacementNodes.
        if (args.ReplacementNodes.Any())
        {
            ((IChildNode)current).Replace(args.ReplacementNodes.ToArray());
        }
    }
}
/// <summary>
/// Notifies subscribers of the <see cref="E:PostProcessNode" /> event, if there are any.
/// </summary>
/// <param name="e">The <see cref="PostProcessNodeEventArgs"/> instance carrying the event data.</param>
protected virtual void OnPostProcessNode(PostProcessNodeEventArgs e)
    => PostProcessNode?.Invoke(this, e);
/// <summary>
/// Raises the <see cref="E:PostProcessNode" /> event.
/// </summary>
/// <param name="e">The <see cref="PostProcessNodeEventArgs"/> instance containing the event data.</param>
protected virtual void OnPostProcessNode(PostProcessNodeEventArgs e)
{
    // Single read of the event field: avoids the check-then-invoke race where the last
    // handler unsubscribes after the null check but before the call.
    PostProcessNode?.Invoke(this, e);
}
/// <summary>
/// Sanitizes the specified HTML.
/// </summary>
/// <param name="html">The HTML to sanitize.</param>
/// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
/// <param name="outputFormatter">The CsQuery output formatter used to render the DOM. Using the default formatter if null.</param>
/// <returns>The sanitized HTML.</returns>
public string Sanitize(string html, string baseUrl = "", IOutputFormatter outputFormatter = null)
{
    var dom = CQ.Create(html);

    // remove non-whitelisted tags
    foreach (var tag in dom["*"].Where(t => !IsAllowedTag(t)).ToList())
    {
        RemoveTag(tag);
    }

    // cleanup attributes
    foreach (var tag in dom["*"].ToList())
    {
        // remove non-whitelisted attributes
        foreach (var attribute in tag.Attributes.Where(a => !IsAllowedAttribute(a)).ToList())
        {
            RemoveAttribute(tag, attribute);
        }

        // sanitize URLs in URL-marked attributes
        foreach (var attribute in tag.Attributes.Where(IsUriAttribute).ToList())
        {
            var url = SanitizeUrl(attribute.Value, baseUrl);

            // a null result means the URL was rejected, so the attribute is dropped
            if (url == null)
            {
                RemoveAttribute(tag, attribute);
            }
            else
            {
                tag.SetAttribute(attribute.Key, url);
            }
        }

        // sanitize the style attribute
        SanitizeStyle(tag.Style, baseUrl);

        // sanitize the value of the attributes
        foreach (var attribute in tag.Attributes.ToList())
        {
            // The '& Javascript include' is a possible method to execute Javascript and can lead to XSS.
            // (see https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet#.26_JavaScript_includes)
            if (attribute.Value.Contains("&{"))
            {
                RemoveAttribute(tag, attribute);
            }
            else
            {
                // escape attribute value: angle brackets are replaced by their HTML entities
                // NOTE(review): confirm the output formatter does not re-encode the '&' of these entities.
                var val = attribute.Value.Replace("<", "&lt;").Replace(">", "&gt;");
                tag.SetAttribute(attribute.Key, val);
            }
        }
    }

    if (PostProcessNode != null)
    {
        // snapshot the nodes first, because nodes may be replaced while iterating
        var nodes = GetAllNodes(dom).ToList();

        foreach (var node in nodes)
        {
            var e = new PostProcessNodeEventArgs { Node = node };
            OnPostProcessNode(e);

            if (e.ReplacementNodes.Any())
            {
                dom[node].ReplaceWith(e.ReplacementNodes);
            }
        }
    }

    if (outputFormatter == null)
    {
        outputFormatter = new FormatDefault(DomRenderingOptions.RemoveComments | DomRenderingOptions.QuoteAllAttributes, HtmlEncoders.Default);
    }

    var output = dom.Render(outputFormatter);

    return output;
}
/// <summary>
/// Sanitizes the specified HTML.
/// </summary>
/// <param name="html">The HTML to sanitize.</param>
/// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
/// <param name="outputFormatter">The CsQuery output formatter used to render the DOM. Using the default formatter if null.</param>
/// <returns>The sanitized HTML.</returns>
public string Sanitize(string html, string baseUrl = "", IOutputFormatter outputFormatter = null)
{
    var dom = CQ.Create(html);

    // remove non-whitelisted tags
    foreach (var tag in dom["*"].Where(t => !IsAllowedTag(t)).ToList())
    {
        RemoveTag(tag);
    }

    // cleanup attributes
    foreach (var tag in dom["*"].ToList())
    {
        // remove non-whitelisted attributes
        foreach (var attribute in tag.Attributes.Where(a => !IsAllowedAttribute(a)).ToList())
        {
            RemoveAttribute(tag, attribute);
        }

        // sanitize URLs in URL-marked attributes
        foreach (var attribute in tag.Attributes.Where(IsUriAttribute).ToList())
        {
            var url = SanitizeUrl(attribute.Value, baseUrl);

            // a null result means the URL was rejected, so the attribute is dropped
            if (url == null)
            {
                RemoveAttribute(tag, attribute);
            }
            else
            {
                tag.SetAttribute(attribute.Key, url);
            }
        }

        // sanitize the style attribute
        SanitizeStyle(tag.Style, baseUrl);

        // sanitize the value of the attributes
        foreach (var attribute in tag.Attributes.ToList())
        {
            // The '& Javascript include' is a possible method to execute Javascript and can lead to XSS.
            // (see https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet#.26_JavaScript_includes)
            if (attribute.Value.Contains("&{"))
            {
                RemoveAttribute(tag, attribute);
            }
            else
            {
                // escape attribute value: angle brackets are replaced by their HTML entities
                // NOTE(review): confirm the output formatter does not re-encode the '&' of these entities.
                var val = attribute.Value.Replace("<", "&lt;").Replace(">", "&gt;");
                tag.SetAttribute(attribute.Key, val);
            }
        }
    }

    if (PostProcessNode != null)
    {
        // snapshot the nodes first, because nodes may be replaced while iterating
        var nodes = GetAllNodes(dom).ToList();

        foreach (var node in nodes)
        {
            var e = new PostProcessNodeEventArgs { Node = node };
            OnPostProcessNode(e);

            if (e.ReplacementNodes.Any())
            {
                dom[node].ReplaceWith(e.ReplacementNodes);
            }
        }
    }

    if (outputFormatter == null)
    {
        outputFormatter = new FormatDefault(DomRenderingOptions.RemoveComments | DomRenderingOptions.QuoteAllAttributes, HtmlEncoders.Default);
    }

    var output = dom.Render(outputFormatter);

    return output;
}
/// <summary>
/// Sanitizes the specified HTML.
/// </summary>
/// <param name="html">The HTML to sanitize.</param>
/// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
/// <param name="outputFormatter">The markup formatter used to render the DOM. <c>HtmlMarkupFormatter.Instance</c> is used if null.</param>
/// <returns>The sanitized HTML, i.e. the serialized child nodes of the document body.</returns>
public string Sanitize(string html, string baseUrl = "", IMarkupFormatter outputFormatter = null)
{
    // Configure the CSS parser to keep unknown declarations/rules and tolerate invalid
    // constraints/values instead of rejecting them during parsing.
    var parser = new HtmlParser(new Configuration().WithCss(e => e.Options = new CssParserOptions
    {
        IsIncludingUnknownDeclarations = true,
        IsIncludingUnknownRules = true,
        IsToleratingInvalidConstraints = true,
        IsToleratingInvalidValues = true
    }));
    var dom = parser.Parse(html);

    // remove non-whitelisted tags
    foreach (var tag in dom.Body.QuerySelectorAll("*").Where(t => !IsAllowedTag(t)).ToList())
    {
        RemoveTag(tag, RemoveReason.NotAllowedTag);
    }

    // cleanup attributes
    foreach (var tag in dom.Body.QuerySelectorAll("*").OfType<IHtmlElement>().ToList())
    {
        // remove non-whitelisted attributes
        foreach (var attribute in tag.Attributes.Where(a => !IsAllowedAttribute(a)).ToList())
        {
            RemoveAttribute(tag, attribute, RemoveReason.NotAllowedAttribute);
        }

        // sanitize URLs in URL-marked attributes
        foreach (var attribute in tag.Attributes.Where(IsUriAttribute).ToList())
        {
            var url = SanitizeUrl(attribute.Value, baseUrl);

            // a null result means the URL was rejected, so the attribute is dropped
            if (url == null)
            {
                RemoveAttribute(tag, attribute, RemoveReason.NotAllowedUrlValue);
            }
            else
            {
                tag.SetAttribute(attribute.Name, url);
            }
        }

        // sanitize the style attribute
        SanitizeStyle(tag, baseUrl);

        // sanitize the value of the attributes
        foreach (var attribute in tag.Attributes.ToList())
        {
            // The '& Javascript include' is a possible method to execute Javascript and can lead to XSS.
            // (see https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet#.26_JavaScript_includes)
            if (attribute.Value.Contains("&{"))
            {
                RemoveAttribute(tag, attribute, RemoveReason.NotAllowedValue);
            }
            else
            {
                // escape attribute value: angle brackets are replaced by their HTML entities
                // NOTE(review): confirm the markup formatter does not re-encode the '&' of these entities.
                var val = attribute.Value.Replace("<", "&lt;").Replace(">", "&gt;");
                tag.SetAttribute(attribute.Name, val);
            }
        }
    }

    // snapshot all nodes below the body, then strip the comments
    var nodes = GetAllNodes(dom.Body).ToList();

    foreach (var comment in nodes.OfType<IComment>())
    {
        comment.Remove();
    }

    if (PostProcessNode != null)
    {
        // NOTE(review): `nodes` was captured before the comments were removed, so handlers
        // still receive the removed comment nodes here - confirm this is intended.
        foreach (var node in nodes)
        {
            var e = new PostProcessNodeEventArgs { Document = dom, Node = node };
            OnPostProcessNode(e);

            if (e.ReplacementNodes.Any())
            {
                ((IChildNode)node).Replace(e.ReplacementNodes.ToArray());
            }
        }
    }

    var output = dom.Body.ChildNodes.ToHtml(outputFormatter ?? HtmlMarkupFormatter.Instance);

    return output;
}