Example #1
0
        /// <summary>
        /// Sanitizes an HTML fragment token by token. Tags whose name matches
        /// <paramref name="forbiddenTags"/> are removed, tags whose name matches
        /// <paramref name="allowedTags"/> are rebuilt with filtered attributes, and everything
        /// else is recorded as invalid.
        /// </summary>
        /// <param name="html">Markup to sanitize. Null or empty input returns a freshly constructed result.</param>
        /// <param name="allowedTags">Pattern matched against the lower-cased tag name to accept a tag.</param>
        /// <param name="forbiddenTags">Pattern matched against the lower-cased tag name to drop a tag.</param>
        /// <returns>
        /// A <c>SanitizeResult</c>: <c>val</c> and <c>html</c> accumulate the processed tokens
        /// (<c>html</c> receives the encoded form of every token that was not accepted),
        /// <c>text</c> accumulates the encoded, line-feed-stripped non-accepted tokens,
        /// <c>invalidTags</c> lists every rejected tag/attribute and <c>isValid</c> is true only
        /// when that list is empty.
        /// </returns>
        public static SanitizeResult sanitizer(String html, Regex allowedTags, Regex forbiddenTags)
        {
            SanitizeResult ret = new SanitizeResult();
            Stack<String> openTags = new Stack<string>();

            if (String.IsNullOrEmpty(html))
            {
                return ret;
            }

            List<String> tokens = tokenize(html);

            // -------------------   LOOP for every token --------------------------
            for (int i = 0; i < tokens.Count; i++)
            {
                String token = tokens[i];
                bool isAcceptedToken = false;

                Match startMatcher = tagStartPattern.Match(token);
                Match endMatcher = tagClosePattern.Match(token);

                if (commentPattern.Match(token).Success)
                {
                    // ------------------------------------------------------------  COMMENT    <!-- ......... -->
                    // Comments are always reported as invalid; an unterminated one is closed first.
                    String comment = token.EndsWith("-->", StringComparison.Ordinal) ? token : token + "-->";
                    ret.val = ret.val + comment;
                    ret.invalidTags.Add(comment);
                    continue;
                }
                else if (startMatcher.Success)
                {
                    // ------------------------------------------------------------  OPEN TAG    <tag .........>
                    // Invariant lower-casing keeps the name comparisons below culture independent.
                    String tag = startMatcher.Groups[1].Value.ToLowerInvariant();

                    if (forbiddenTags.Match(tag).Success)
                    {
                        // ------------------------------------------------  FORBIDDEN TAG   <script .........>
                        ret.invalidTags.Add("<" + tag + ">");
                        continue;
                    }
                    else if (allowedTags.Match(tag).Success)
                    {
                        // ------------------------------------------------  WELL KNOWN TAG
                        String cleanToken = "<" + tag;
                        String tokenBody = startMatcher.Groups[2].Value;

                        // First test table consistency: row groups and rows need an open <table>,
                        // cells need an open <tr>.
                        if (tag == "thead" || tag == "tbody" || tag == "tfoot" || tag == "tr")
                        {
                            if (!openTags.Contains("table"))
                            {
                                ret.invalidTags.Add("<" + tag + ">");
                                continue;
                            }
                        }
                        else if (tag == "td" || tag == "th")
                        {
                            if (!openTags.Contains("tr"))
                            {
                                ret.invalidTags.Add("<" + tag + ">");
                                continue;
                            }
                        }

                        // Then test the attributes.
                        bool foundURL = false; // set once <a>/<img>/<embed> carries an acceptable URL

                        foreach (Match attribute in attributesPattern.Matches(tokenBody))
                        {
                            String attr = attribute.Groups[1].Value.ToLowerInvariant();
                            String val = attribute.Groups[2].Value;

                            if (tag == "a" && attr == "href")
                            {
                                // <a href="......">: only absolute http, https and mailto targets are kept.
                                Uri url;
                                if (Uri.TryCreate(val, UriKind.Absolute, out url)
                                    && (url.Scheme == Uri.UriSchemeHttp || url.Scheme == Uri.UriSchemeHttps || url.Scheme == Uri.UriSchemeMailto))
                                {
                                    foundURL = true;
                                }
                                else
                                {
                                    ret.invalidTags.Add(attr + " " + val);
                                    val = "";
                                }
                            }
                            else if ((tag == "img" || tag == "embed") && attr == "src")
                            {
                                // <img src="......">: only absolute http and https sources are kept.
                                Uri url;
                                if (Uri.TryCreate(val, UriKind.Absolute, out url)
                                    && (url.Scheme == Uri.UriSchemeHttp || url.Scheme == Uri.UriSchemeHttps))
                                {
                                    foundURL = true;
                                }
                                else
                                {
                                    ret.invalidTags.Add(attr + " " + val);
                                    val = "";
                                }
                            }
                            else if (attr == "href" || attr == "src")
                            {
                                // <tag src/href="......"> on any other tag is skipped.
                                ret.invalidTags.Add(tag + " " + attr + " " + val);
                                continue;
                            }
                            else if (attr == "width" || attr == "height")
                            {
                                // <tag width/height="......">: the whole value must be digits with an
                                // optional trailing '%'. The value is emitted unencoded below, so the
                                // pattern is anchored on both ends.
                                if (!Regex.IsMatch(val, "^\\d+%?\\z"))
                                {
                                    ret.invalidTags.Add(tag + " " + attr + " " + val);
                                    continue;
                                }
                            }
                            else if (attr == "style")
                            {
                                // <tag style="......">: rebuild the declaration list from the accepted entries.
                                String cleanStyle = "";

                                foreach (Match style in stylePattern.Matches(val))
                                {
                                    String styleName = style.Groups[1].Value.ToLowerInvariant();
                                    String styleValue = style.Groups[2].Value;

                                    // suppress invalid style values
                                    if (forbiddenStylePattern.Match(styleValue).Success)
                                    {
                                        ret.invalidTags.Add(tag + " " + attr + " " + styleValue);
                                        continue;
                                    }

                                    // a url inside a style value must be an absolute http/https URI
                                    Match urlStyleMatcher = urlStylePattern.Match(styleValue);
                                    if (urlStyleMatcher.Success)
                                    {
                                        Uri styleUri;
                                        bool isHttpUrl = Uri.TryCreate(urlStyleMatcher.Groups[1].Value, UriKind.Absolute, out styleUri)
                                            && (styleUri.Scheme == Uri.UriSchemeHttp || styleUri.Scheme == Uri.UriSchemeHttps);

                                        if (!isHttpUrl)
                                        {
                                            ret.invalidTags.Add(tag + " " + attr + " " + styleValue);
                                            continue;
                                        }
                                    }

                                    cleanStyle = cleanStyle + styleName + ":" + encode(styleValue) + ";";
                                }

                                val = cleanStyle;
                            }
                            else if (attr.StartsWith("on", StringComparison.Ordinal))
                            {
                                // skip all javascript events
                                ret.invalidTags.Add(tag + " " + attr + " " + val);
                                continue;
                            }
                            else
                            {
                                // by default encode all properties
                                val = encode(val);
                            }

                            cleanToken = cleanToken + " " + attr + "=\"" + val + "\"";
                        }

                        if (selfClosed.Match(token).Success)
                        {
                            cleanToken = cleanToken + "/>";
                        }
                        else
                        {
                            cleanToken = cleanToken + ">";
                        }

                        isAcceptedToken = true;

                        // <a>, <img> and <embed> without an accepted URL are discarded entirely.
                        if ((tag == "a" || tag == "img" || tag == "embed") && !foundURL)
                        {
                            isAcceptedToken = false;
                            cleanToken = "";
                        }

                        token = cleanToken;

                        // push the tag if it requires closure and it is accepted (otherwise it is encoded)
                        if (isAcceptedToken && !(standAloneTags.Match(tag).Success || selfClosed.Match(token).Success))
                        {
                            openTags.Push(tag);
                        }
                    }
                    else
                    {
                        // ------------------------------------------------------------  UNKNOWN TAG
                        // Reported and copied to val only; nothing is appended to html or text.
                        ret.invalidTags.Add(token);
                        ret.val = ret.val + token;
                        continue;
                    }
                }
                else if (endMatcher.Success)
                {
                    // ------------------------------------------------------------  CLOSE TAG </tag>
                    String tag = endMatcher.Groups[1].Value.ToLowerInvariant();

                    // NOTE(review): selfClosed is matched against the tag name here, while the
                    // open-tag branch matches it against the whole token - confirm this is intended.
                    if (selfClosed.Match(tag).Success)
                    {
                        ret.invalidTags.Add(token);
                        continue;
                    }
                    if (forbiddenTags.Match(tag).Success)
                    {
                        ret.invalidTags.Add("/" + tag);
                        continue;
                    }
                    if (!allowedTags.Match(tag).Success)
                    {
                        ret.invalidTags.Add(token);
                        ret.val = ret.val + token;
                        continue;
                    }

                    String cleanToken = "";

                    // When the tag is open somewhere on the stack, close everything opened after
                    // it and then the tag itself. An unmatched close tag produces an empty token.
                    if (openTags.Contains(tag))
                    {
                        String poppedTag;
                        do
                        {
                            poppedTag = openTags.Pop();
                            cleanToken = cleanToken + "</" + poppedTag + ">";
                        }
                        while (poppedTag != tag);

                        isAcceptedToken = true;
                    }

                    token = cleanToken;
                }

                ret.val = ret.val + token;

                if (isAcceptedToken)
                {
                    ret.html = ret.html + token;
                }
                else
                {
                    // Anything not accepted (plain text included) is emitted encoded.
                    String sanToken = htmlEncodeApexesAndTags(token);
                    ret.html = ret.html + sanToken;
                    ret.text = ret.text + htmlEncodeApexesAndTags(removeLineFeed(token));
                }
            }

            // must close remaining tags
            while (openTags.Count > 0)
            {
                String poppedTag = openTags.Pop();
                ret.html = ret.html + "</" + poppedTag + ">";
                ret.val = ret.val + "</" + poppedTag + ">";
            }

            // set boolean value
            ret.isValid = ret.invalidTags.Count == 0;

            return ret;
        }
Example #2
0
        /// <summary>
        /// Sanitizes an HTML fragment token by token. Tags whose name matches
        /// <paramref name="forbiddenTags"/> are removed, tags whose name matches
        /// <paramref name="allowedTags"/> are rebuilt with filtered attributes, and everything
        /// else is recorded as invalid. In this variant an <c>href</c>/<c>src</c> value that
        /// cannot be parsed as an absolute URI is treated as a relative URL and kept.
        /// </summary>
        /// <param name="html">Markup to sanitize. Null or empty input returns a freshly constructed result.</param>
        /// <param name="allowedTags">Pattern matched against the lower-cased tag name to accept a tag.</param>
        /// <param name="forbiddenTags">Pattern matched against the lower-cased tag name to drop a tag.</param>
        /// <returns>
        /// A <c>SanitizeResult</c>: <c>val</c> and <c>html</c> accumulate the processed tokens
        /// (<c>html</c> receives the encoded form of every token that was not accepted),
        /// <c>text</c> accumulates the encoded, line-feed-stripped non-accepted tokens,
        /// <c>invalidTags</c> lists every rejected tag/attribute and <c>isValid</c> is true only
        /// when that list is empty.
        /// </returns>
        public static SanitizeResult sanitizer(String html, Regex allowedTags, Regex forbiddenTags)
        {
            SanitizeResult ret      = new SanitizeResult();
            Stack <String> openTags = new Stack <string>();

            if (String.IsNullOrEmpty(html))
            {
                return ret;
            }

            List <String> tokens = tokenize(html);

            // -------------------   LOOP for every token --------------------------
            for (int i = 0; i < tokens.Count; i++)
            {
                String token           = tokens[i];
                bool   isAcceptedToken = false;

                Match startMatcher = tagStartPattern.Match(token);
                Match endMatcher   = tagClosePattern.Match(token);

                if (commentPattern.Match(token).Success)
                {
                    // ------------------------------------------------------------  COMMENT    <!-- ......... -->
                    // Comments are always reported as invalid; an unterminated one is closed first.
                    String comment = token.EndsWith("-->", StringComparison.Ordinal) ? token : token + "-->";
                    ret.val = ret.val + comment;
                    ret.invalidTags.Add(comment);
                    continue;
                }
                else if (startMatcher.Success)
                {
                    // ------------------------------------------------------------  OPEN TAG    <tag .........>
                    // Invariant lower-casing keeps the name comparisons below culture independent.
                    String tag = startMatcher.Groups[1].Value.ToLowerInvariant();

                    if (forbiddenTags.Match(tag).Success)
                    {
                        // ------------------------------------------------  FORBIDDEN TAG   <script .........>
                        ret.invalidTags.Add("<" + tag + ">");
                        continue;
                    }
                    else if (allowedTags.Match(tag).Success)
                    {
                        // ------------------------------------------------  WELL KNOWN TAG
                        String cleanToken = "<" + tag;
                        String tokenBody  = startMatcher.Groups[2].Value;

                        // First test table consistency: row groups and rows need an open <table>,
                        // cells need an open <tr>.
                        if (tag == "thead" || tag == "tbody" || tag == "tfoot" || tag == "tr")
                        {
                            if (!openTags.Contains("table"))
                            {
                                ret.invalidTags.Add("<" + tag + ">");
                                continue;
                            }
                        }
                        else if (tag == "td" || tag == "th")
                        {
                            if (!openTags.Contains("tr"))
                            {
                                ret.invalidTags.Add("<" + tag + ">");
                                continue;
                            }
                        }

                        // Then test the attributes.
                        bool foundURL = false; // set once <a>/<img>/<embed> carries an acceptable URL

                        foreach (Match attribute in attributesPattern.Matches(tokenBody))
                        {
                            String attr = attribute.Groups[1].Value.ToLowerInvariant();
                            String val  = attribute.Groups[2].Value;

                            if (tag == "a" && attr == "href")
                            {
                                // <a href="......">
                                Uri url;
                                if (!Uri.TryCreate(val, UriKind.Absolute, out url))
                                {
                                    // Not an absolute URI: treated as a relative URL and kept unchanged.
                                    // NOTE(review): the value is emitted without encoding - confirm that
                                    // attributesPattern cannot capture quotes or control characters.
                                    foundURL = true;
                                }
                                else if (url.Scheme == Uri.UriSchemeHttp || url.Scheme == Uri.UriSchemeHttps || url.Scheme == Uri.UriSchemeMailto)
                                {
                                    foundURL = true;
                                }
                                else
                                {
                                    ret.invalidTags.Add(attr + " " + val);
                                    val = "";
                                }
                            }
                            else if ((tag == "img" || tag == "embed") && attr == "src")
                            {
                                // <img src="......">
                                Uri url;
                                if (!Uri.TryCreate(val, UriKind.Absolute, out url))
                                {
                                    // Not an absolute URI: treated as a relative URL and kept unchanged.
                                    // NOTE(review): the value is emitted without encoding - confirm that
                                    // attributesPattern cannot capture quotes or control characters.
                                    foundURL = true;
                                }
                                else if (url.Scheme == Uri.UriSchemeHttp || url.Scheme == Uri.UriSchemeHttps)
                                {
                                    foundURL = true;
                                }
                                else
                                {
                                    ret.invalidTags.Add(attr + " " + val);
                                    val = "";
                                }
                            }
                            else if (attr == "href" || attr == "src")
                            {
                                // <tag src/href="......"> on any other tag is skipped.
                                ret.invalidTags.Add(tag + " " + attr + " " + val);
                                continue;
                            }
                            else if (attr == "width" || attr == "height")
                            {
                                // <tag width/height="......">: the whole value must be digits with an
                                // optional trailing '%'. The value is emitted unencoded below, so the
                                // pattern is anchored on both ends.
                                if (!Regex.IsMatch(val, "^\\d+%?\\z"))
                                {
                                    ret.invalidTags.Add(tag + " " + attr + " " + val);
                                    continue;
                                }
                            }
                            else if (attr == "style")
                            {
                                // <tag style="......">: rebuild the declaration list from the accepted entries.
                                String cleanStyle = "";

                                foreach (Match style in stylePattern.Matches(val))
                                {
                                    String styleName  = style.Groups[1].Value.ToLowerInvariant();
                                    String styleValue = style.Groups[2].Value;

                                    // suppress invalid style values
                                    if (forbiddenStylePattern.Match(styleValue).Success)
                                    {
                                        ret.invalidTags.Add(tag + " " + attr + " " + styleValue);
                                        continue;
                                    }

                                    // a url inside a style value must be an absolute http/https URI
                                    Match urlStyleMatcher = urlStylePattern.Match(styleValue);
                                    if (urlStyleMatcher.Success)
                                    {
                                        Uri  styleUri;
                                        bool isHttpUrl = Uri.TryCreate(urlStyleMatcher.Groups[1].Value, UriKind.Absolute, out styleUri)
                                                         && (styleUri.Scheme == Uri.UriSchemeHttp || styleUri.Scheme == Uri.UriSchemeHttps);

                                        if (!isHttpUrl)
                                        {
                                            ret.invalidTags.Add(tag + " " + attr + " " + styleValue);
                                            continue;
                                        }
                                    }

                                    cleanStyle = cleanStyle + styleName + ":" + encode(styleValue) + ";";
                                }

                                val = cleanStyle;
                            }
                            else if (attr.StartsWith("on", StringComparison.Ordinal))
                            {
                                // skip all javascript events
                                ret.invalidTags.Add(tag + " " + attr + " " + val);
                                continue;
                            }
                            else
                            {
                                // by default encode all properties
                                val = encode(val);
                            }

                            cleanToken = cleanToken + " " + attr + "=\"" + val + "\"";
                        }

                        if (selfClosed.Match(token).Success)
                        {
                            cleanToken = cleanToken + "/>";
                        }
                        else
                        {
                            cleanToken = cleanToken + ">";
                        }

                        isAcceptedToken = true;

                        // <a>, <img> and <embed> without an accepted URL are discarded entirely.
                        if ((tag == "a" || tag == "img" || tag == "embed") && !foundURL)
                        {
                            isAcceptedToken = false;
                            cleanToken      = "";
                        }

                        token = cleanToken;

                        // push the tag if it requires closure and it is accepted (otherwise it is encoded)
                        if (isAcceptedToken && !(standAloneTags.Match(tag).Success || selfClosed.Match(token).Success))
                        {
                            openTags.Push(tag);
                        }
                    }
                    else
                    {
                        // ------------------------------------------------------------  UNKNOWN TAG
                        // Reported and copied to val only; nothing is appended to html or text.
                        ret.invalidTags.Add(token);
                        ret.val = ret.val + token;
                        continue;
                    }
                }
                else if (endMatcher.Success)
                {
                    // ------------------------------------------------------------  CLOSE TAG </tag>
                    String tag = endMatcher.Groups[1].Value.ToLowerInvariant();

                    // NOTE(review): selfClosed is matched against the tag name here, while the
                    // open-tag branch matches it against the whole token - confirm this is intended.
                    if (selfClosed.Match(tag).Success)
                    {
                        ret.invalidTags.Add(token);
                        continue;
                    }
                    if (forbiddenTags.Match(tag).Success)
                    {
                        ret.invalidTags.Add("/" + tag);
                        continue;
                    }
                    if (!allowedTags.Match(tag).Success)
                    {
                        ret.invalidTags.Add(token);
                        ret.val = ret.val + token;
                        continue;
                    }

                    String cleanToken = "";

                    // When the tag is open somewhere on the stack, close everything opened after
                    // it and then the tag itself. An unmatched close tag produces an empty token.
                    if (openTags.Contains(tag))
                    {
                        String poppedTag;
                        do
                        {
                            poppedTag  = openTags.Pop();
                            cleanToken = cleanToken + "</" + poppedTag + ">";
                        }
                        while (poppedTag != tag);

                        isAcceptedToken = true;
                    }

                    token = cleanToken;
                }

                ret.val = ret.val + token;

                if (isAcceptedToken)
                {
                    ret.html = ret.html + token;
                }
                else
                {
                    // Anything not accepted (plain text included) is emitted encoded.
                    String sanToken = htmlEncodeApexesAndTags(token);
                    ret.html = ret.html + sanToken;
                    ret.text = ret.text + htmlEncodeApexesAndTags(removeLineFeed(token));
                }
            }

            // must close remaining tags
            while (openTags.Count > 0)
            {
                String poppedTag = openTags.Pop();
                ret.html = ret.html + "</" + poppedTag + ">";
                ret.val  = ret.val + "</" + poppedTag + ">";
            }

            // set boolean value
            ret.isValid = ret.invalidTags.Count == 0;

            return ret;
        }