示例#1
0
        /// <summary>
        /// Raises the post-processing events: <c>PostProcessNode</c> once for every node returned by
        /// <c>GetAllNodes</c> for <paramref name="context"/>, followed by <c>PostProcessDom</c> once
        /// for the document. Each stage is skipped when its event has no subscribers.
        /// </summary>
        /// <param name="dom">The HTML document. It is normalized before the nodes are visited.</param>
        /// <param name="context">The node within which all nodes are post processed.</param>
        private void DoPostProcess(IHtmlDocument dom, INode context)
        {
            if (PostProcessNode != null)
            {
                dom.Normalize();

                // Work on a snapshot, because nodes can be replaced while we walk the list.
                var snapshot = GetAllNodes(context).ToList();

                for (var index = 0; index < snapshot.Count; index++)
                {
                    var current = snapshot[index];
                    var nodeArgs = new PostProcessNodeEventArgs { Document = dom, Node = current };
                    OnPostProcessNode(nodeArgs);

                    // A handler requests a substitution by supplying replacement nodes.
                    var replacements = nodeArgs.ReplacementNodes.ToArray();
                    if (replacements.Length > 0)
                    {
                        ((IChildNode)current).Replace(replacements);
                    }
                }
            }

            if (PostProcessDom == null)
            {
                return;
            }

            var domArgs = new PostProcessDomEventArgs { Document = dom };
            OnPostProcessDom(domArgs);
        }
示例#2
0
 /// <summary>
 /// Raises the <see cref="E:PostProcessNode" /> event. Does nothing when no handler is subscribed.
 /// </summary>
 /// <param name="e">The <see cref="PostProcessNodeEventArgs"/> instance containing the event data.</param>
 protected virtual void OnPostProcessNode(PostProcessNodeEventArgs e)
 {
     // The null-conditional call reads the delegate only once, so a handler that
     // unsubscribes between a null check and the invocation cannot cause a
     // NullReferenceException.
     PostProcessNode?.Invoke(this, e);
 }
示例#3
0
 /// <summary>
 /// Raises <c>PostProcessNode</c> for each of the supplied nodes and applies any replacement
 /// nodes a handler provided. Returns immediately when the event has no subscribers.
 /// </summary>
 /// <param name="dom">The HTML document, handed to subscribers through the event arguments.</param>
 /// <param name="nodes">The list of nodes in the document, visited in list order.</param>
 private void DoPostProcess(IHtmlDocument dom, List <INode> nodes)
 {
     if (PostProcessNode == null)
     {
         return;
     }

     foreach (var current in nodes)
     {
         var args = new PostProcessNodeEventArgs { Document = dom, Node = current };
         OnPostProcessNode(args);

         // Swap the node out only when a handler actually supplied replacements.
         var replacements = args.ReplacementNodes.ToArray();
         if (replacements.Length > 0)
         {
             ((IChildNode)current).Replace(replacements);
         }
     }
 }
示例#4
0
 /// <summary>
 /// Notifies subscribers of the <see cref="E:PostProcessNode" /> event; a no-op when there are none.
 /// </summary>
 /// <param name="e">The <see cref="PostProcessNodeEventArgs"/> instance carrying the event data.</param>
 protected virtual void OnPostProcessNode(PostProcessNodeEventArgs e) => PostProcessNode?.Invoke(this, e);
示例#5
0
 /// <summary>
 /// Raises the <see cref="E:PostProcessNode" /> event. Does nothing when no handler is subscribed.
 /// </summary>
 /// <param name="e">The <see cref="PostProcessNodeEventArgs"/> instance containing the event data.</param>
 protected virtual void OnPostProcessNode(PostProcessNodeEventArgs e)
 {
     // Invoke through the null-conditional operator: the delegate is read a single
     // time, which removes the gap between the null check and the call.
     PostProcessNode?.Invoke(this, e);
 }
示例#6
0
        /// <summary>
        /// Sanitizes the specified HTML: removes tags and attributes that are not allowed, sanitizes
        /// URL and style values, raises <c>PostProcessNode</c> for each node when it has subscribers,
        /// and renders the resulting DOM.
        /// </summary>
        /// <param name="html">The HTML to sanitize.</param>
        /// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
        /// <param name="outputFormatter">The CsQuery output formatter used to render the DOM. When null, a
        /// <c>FormatDefault</c> that removes comments and quotes all attributes is used.</param>
        /// <returns>The sanitized HTML.</returns>
        public string Sanitize(string html, string baseUrl = "", IOutputFormatter outputFormatter = null)
        {
            var document = CQ.Create(html);

            // Pass 1: drop every element whose tag is not whitelisted.
            var disallowedElements = document["*"].Where(t => !IsAllowedTag(t)).ToList();
            foreach (var element in disallowedElements)
            {
                RemoveTag(element);
            }

            // Pass 2: clean up the attributes of the elements that are left.
            var remainingElements = document["*"].ToList();
            foreach (var element in remainingElements)
            {
                // Attributes that are not whitelisted go first.
                var disallowedAttributes = element.Attributes.Where(a => !IsAllowedAttribute(a)).ToList();
                foreach (var attr in disallowedAttributes)
                {
                    RemoveAttribute(element, attr);
                }

                // URL-marked attributes keep their sanitized URL or are removed when it is rejected.
                var uriAttributes = element.Attributes.Where(IsUriAttribute).ToList();
                foreach (var attr in uriAttributes)
                {
                    var sanitizedUrl = SanitizeUrl(attr.Value, baseUrl);
                    if (sanitizedUrl != null)
                    {
                        element.SetAttribute(attr.Key, sanitizedUrl);
                    }
                    else
                    {
                        RemoveAttribute(element, attr);
                    }
                }

                // The style attribute gets its own sanitization step.
                SanitizeStyle(element.Style, baseUrl);

                // Finally inspect the value of every attribute that survived.
                var survivingAttributes = element.Attributes.ToList();
                foreach (var attr in survivingAttributes)
                {
                    // The '& Javascript include' is a possible method to execute Javascript and can lead to XSS.
                    // (see https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet#.26_JavaScript_includes)
                    if (!attr.Value.Contains("&{"))
                    {
                        // Escape angle brackets in the attribute value.
                        var escaped = attr.Value.Replace("<", "&lt;").Replace(">", "&gt;");
                        element.SetAttribute(attr.Key, escaped);
                    }
                    else
                    {
                        RemoveAttribute(element, attr);
                    }
                }
            }

            // Let subscribers inspect each node and optionally replace it.
            if (PostProcessNode != null)
            {
                foreach (var node in GetAllNodes(document).ToList())
                {
                    var args = new PostProcessNodeEventArgs { Node = node };
                    OnPostProcessNode(args);
                    if (args.ReplacementNodes.Any())
                    {
                        document[node].ReplaceWith(args.ReplacementNodes);
                    }
                }
            }

            var formatter = outputFormatter
                ?? new FormatDefault(DomRenderingOptions.RemoveComments | DomRenderingOptions.QuoteAllAttributes, HtmlEncoders.Default);

            return document.Render(formatter);
        }
示例#7
0
        /// <summary>
        /// Sanitizes the specified HTML by removing tags and attributes that are not allowed,
        /// sanitizing URL and style values, raising <c>PostProcessNode</c> for each node when it has
        /// subscribers, and rendering the resulting DOM.
        /// </summary>
        /// <param name="html">The HTML to sanitize.</param>
        /// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
        /// <param name="outputFormatter">The CsQuery output formatter used to render the DOM. When null, a
        /// <c>FormatDefault</c> that removes comments and quotes all attributes is used.</param>
        /// <returns>The sanitized HTML.</returns>
        public string Sanitize(string html, string baseUrl = "", IOutputFormatter outputFormatter = null)
        {
            var document = CQ.Create(html);

            // Step 1: remove elements whose tag is not whitelisted.
            var rejectedElements = document["*"].Where(t => !IsAllowedTag(t)).ToList();
            foreach (var element in rejectedElements)
            {
                RemoveTag(element);
            }

            // Step 2: clean the attributes of every remaining element.
            var keptElements = document["*"].ToList();
            foreach (var element in keptElements)
            {
                // Remove attributes that are not whitelisted.
                var rejectedAttributes = element.Attributes.Where(a => !IsAllowedAttribute(a)).ToList();
                foreach (var attr in rejectedAttributes)
                {
                    RemoveAttribute(element, attr);
                }

                // Sanitize URLs in URL-marked attributes; a rejected URL removes the attribute.
                var urlAttributes = element.Attributes.Where(IsUriAttribute).ToList();
                foreach (var attr in urlAttributes)
                {
                    var cleanUrl = SanitizeUrl(attr.Value, baseUrl);
                    if (cleanUrl == null)
                    {
                        RemoveAttribute(element, attr);
                        continue;
                    }

                    element.SetAttribute(attr.Key, cleanUrl);
                }

                // Sanitize the style attribute.
                SanitizeStyle(element.Style, baseUrl);

                // Check the value of each attribute that is still present.
                var currentAttributes = element.Attributes.ToList();
                foreach (var attr in currentAttributes)
                {
                    // The '& Javascript include' is a possible method to execute Javascript and can lead to XSS.
                    // (see https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet#.26_JavaScript_includes)
                    if (attr.Value.Contains("&{"))
                    {
                        RemoveAttribute(element, attr);
                        continue;
                    }

                    // Escape angle brackets in the attribute value.
                    element.SetAttribute(attr.Key, attr.Value.Replace("<", "&lt;").Replace(">", "&gt;"));
                }
            }

            // Step 3: give subscribers the chance to replace individual nodes.
            if (PostProcessNode != null)
            {
                var allNodes = GetAllNodes(document).ToList();
                foreach (var current in allNodes)
                {
                    var args = new PostProcessNodeEventArgs { Node = current };
                    OnPostProcessNode(args);

                    if (args.ReplacementNodes.Any())
                    {
                        document[current].ReplaceWith(args.ReplacementNodes);
                    }
                }
            }

            // Step 4: render with the caller's formatter, or the default one when none was given.
            return document.Render(
                outputFormatter
                ?? new FormatDefault(DomRenderingOptions.RemoveComments | DomRenderingOptions.QuoteAllAttributes, HtmlEncoders.Default));
        }
        /// <summary>
        /// Sanitizes the specified HTML.
        /// </summary>
        /// <remarks>
        /// Only the contents of the parsed document's body are processed and rendered. Comment nodes
        /// are removed before the <c>PostProcessNode</c> event is raised, and the nodes handed to
        /// subscribers are collected after that removal, so handlers are not invoked for comments
        /// that have already been removed.
        /// </remarks>
        /// <param name="html">The HTML to sanitize.</param>
        /// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
        /// <param name="outputFormatter">The markup formatter used to render the DOM. <c>HtmlMarkupFormatter.Instance</c> is used if null.</param>
        /// <returns>The sanitized HTML.</returns>
        public string Sanitize(string html, string baseUrl = "", IMarkupFormatter outputFormatter = null)
        {
            // Configure the CSS parser to keep unknown declarations/rules and to tolerate invalid input.
            var parser = new HtmlParser(new Configuration().WithCss(e => e.Options = new CssParserOptions
            {
                IsIncludingUnknownDeclarations = true,
                IsIncludingUnknownRules = true,
                IsToleratingInvalidConstraints = true,
                IsToleratingInvalidValues = true
            }));
            var dom = parser.Parse(html);

            // remove non-whitelisted tags
            foreach (var tag in dom.Body.QuerySelectorAll("*").Where(t => !IsAllowedTag(t)).ToList())
            {
                RemoveTag(tag, RemoveReason.NotAllowedTag);
            }

            // cleanup attributes
            foreach (var tag in dom.Body.QuerySelectorAll("*").OfType<IHtmlElement>().ToList())
            {
                // remove non-whitelisted attributes
                foreach (var attribute in tag.Attributes.Where(a => !IsAllowedAttribute(a)).ToList())
                {
                    RemoveAttribute(tag, attribute, RemoveReason.NotAllowedAttribute);
                }

                // sanitize URLs in URL-marked attributes
                foreach (var attribute in tag.Attributes.Where(IsUriAttribute).ToList())
                {
                    var url = SanitizeUrl(attribute.Value, baseUrl);
                    if (url == null)
                    {
                        RemoveAttribute(tag, attribute, RemoveReason.NotAllowedUrlValue);
                    }
                    else
                    {
                        tag.SetAttribute(attribute.Name, url);
                    }
                }

                // sanitize the style attribute
                SanitizeStyle(tag, baseUrl);

                // sanitize the value of the attributes
                foreach (var attribute in tag.Attributes.ToList())
                {
                    // The '& Javascript include' is a possible method to execute Javascript and can lead to XSS.
                    // (see https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet#.26_JavaScript_includes)
                    if (attribute.Value.Contains("&{"))
                    {
                        RemoveAttribute(tag, attribute, RemoveReason.NotAllowedValue);
                    }
                    else
                    {
                        // escape attribute value
                        var val = attribute.Value.Replace("<", "&lt;").Replace(">", "&gt;");
                        tag.SetAttribute(attribute.Name, val);
                    }
                }
            }

            // Remove comments from a snapshot so the tree can be modified while iterating.
            foreach (var comment in GetAllNodes(dom.Body).OfType<IComment>().ToList())
            {
                comment.Remove();
            }

            if (PostProcessNode != null)
            {
                // Collect the nodes only now, after the comments are gone, so subscribers are
                // never handed a comment that has already been removed.
                foreach (var node in GetAllNodes(dom.Body).ToList())
                {
                    var e = new PostProcessNodeEventArgs { Document = dom, Node = node };
                    OnPostProcessNode(e);
                    if (e.ReplacementNodes.Any())
                    {
                        ((IChildNode)node).Replace(e.ReplacementNodes.ToArray());
                    }
                }
            }

            return dom.Body.ChildNodes.ToHtml(outputFormatter ?? HtmlMarkupFormatter.Instance);
        }