A specialized sanitizer which outputs formatted plain text instead of HTML.
The renderer removes extra whitespace and converts block-level HTML elements to lines of plain text.
Inheritance: Sanitizer
Exemplo n.º 1
0
        /// <summary>
        /// Converts HTML to plain text.
        /// </summary>
        /// <remarks>
        /// Warning: the returned text may contain HTML fragments!
        /// For example, if the original HTML code is:
        /// <code>
        /// &lt;p&gt;&amp;lt;hello&amp;gt;&lt;/p&gt;
        /// </code>
        /// then the converted text will be:
        /// <code>
        /// &lt;hello&gt;
        /// </code>
        /// So if you intend to output the stripped text in HTML, you need to encode it first!
        /// </remarks>
        /// <param name="input">Input HTML. May be <c>null</c>.</param>
        /// <returns>
        /// Plain text, which may contain angle braces; <c>null</c> when <paramref name="input"/> is <c>null</c>.
        /// </returns>
        public static string RenderToPlainText(string input)
        {
            if (input == null)
            {
                return null;
            }

            // Both the reader and the writer are IDisposable; scope them so they are always released.
            using (var inputReader = new StringReader(input))
            using (var resultWriter = new StringWriter())
            {
                // Pre-size the output buffer to the input length to limit reallocations while writing.
                resultWriter.GetStringBuilder().EnsureCapacity(input.Length);

                // Use "\n" as the line terminator regardless of the platform default.
                resultWriter.NewLine = "\n";

                var textRenderer = new PlainTextRenderer();
                textRenderer.Sanitize(inputReader, resultWriter);

                return resultWriter.ToString();
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Renders <paramref name="value"/> to plain text and returns an HTML-encoded version of it in which
        /// every occurrence of the search words is wrapped in <c>&lt;span class="wordHighlight"&gt;</c>.
        /// </summary>
        /// <remarks>
        /// The search is case-insensitive. Each character of the search words is matched literally;
        /// the only character with a special meaning is <c>|</c>, which separates alternatives.
        /// When at least one match is found, the output is assembled from the fragments selected by
        /// <c>DefineStartPosition</c>, <c>DefineLength</c> and <c>CutText</c> (declared elsewhere in this class).
        /// </remarks>
        /// <param name="value">Source HTML. <c>null</c> is treated as an empty string.</param>
        /// <param name="searchPattern">
        /// Search expression, split into words by <c>StringUtils.SmartSplit</c>.
        /// When blank, the encoded plain text is returned without highlighting.
        /// </param>
        /// <returns>HTML-encoded text; matched words are wrapped in highlight spans.</returns>
        public static string Highlight(string value, string searchPattern)
        {
            value = PlainTextRenderer.RenderToPlainText(value) ?? "";

            if (StringUtils.IsBlank(searchPattern))
            {
                // The text may contain angle braces read from html entities &lt; and &gt;, so encode it.
                return HttpUtility.HtmlEncode(value);
            }

            string[] words        = StringUtils.SmartSplit(searchPattern);
            string   alternatives = string.Join("|", words).Trim();

            // Build a pattern in which every character is wrapped in its own character class,
            // so that regex metacharacters in the user's input are matched literally.
            var pattern = new StringBuilder(alternatives.Length * 4);

            foreach (char c in alternatives)
            {
                switch (c)
                {
                    case '|':
                        // Keep '|' as the alternation operator, but never emit a leading or doubled
                        // separator: an empty alternative would match the empty string everywhere.
                        if (pattern.Length > 0 && pattern[pattern.Length - 1] != '|')
                        {
                            pattern.Append('|');
                        }
                        break;

                    case '\\':
                        // A backslash must be escaped even inside a character class.
                        pattern.Append("[\\\\]");
                        break;

                    case '^':
                        // An unescaped leading '^' would negate the character class.
                        pattern.Append("[\\^]");
                        break;

                    default:
                        pattern.Append('[').Append(c).Append(']');
                        break;
                }
            }

            // Drop a trailing separator for the same reason as above (empty alternative).
            if (pattern.Length > 0 && pattern[pattern.Length - 1] == '|')
            {
                pattern.Length--;
            }

            if (pattern.Length == 0)
            {
                // Nothing to search for; an empty regex would match at every position.
                return HttpUtility.HtmlEncode(value);
            }

            var             rx      = new Regex(pattern.ToString(), RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline | RegexOptions.IgnoreCase);
            MatchCollection matches = rx.Matches(value);

            if (matches.Count == 0)
            {
                // The text may contain angle braces read from html entities &lt; and &gt;, so encode it.
                return HttpUtility.HtmlEncode(value);
            }

            var excerpt     = new StringBuilder();
            var occurrences = new Dictionary<string, int>();

            foreach (Match match in matches)
            {
                // Use at most MAX_MATCH_CYCLES occurrences of the same matched string.
                int seen;

                if (!occurrences.TryGetValue(match.Value, out seen))
                {
                    occurrences[match.Value] = 1;
                }
                else if (seen < MAX_MATCH_CYCLES)
                {
                    // Store the incremented count (the former "x = x++" left the counter unchanged).
                    occurrences[match.Value] = seen + 1;
                }
                else
                {
                    continue;
                }

                int start  = DefineStartPosition(match);
                int length = DefineLength(value, match, start);

                if (start == 0 && length == value.Length)
                {
                    // The fragment covers the whole text: use the text as-is and stop collecting.
                    excerpt.Length = 0;
                    excerpt.Append(value);
                    break;
                }

                excerpt.Append(CutText(value, start, length));
            }

            string text   = excerpt.ToString();
            var    result = new StringBuilder(text.Length * 2);
            int    offset = 0;

            // Re-run the search on the assembled text and wrap each match in a highlight span,
            // HTML-encoding both the matches and the text between them.
            foreach (Match match in rx.Matches(text))
            {
                result.Append(HttpUtility.HtmlEncode(text.Substring(offset, match.Index - offset)));
                result.Append("<span class=\"wordHighlight\">").Append(HttpUtility.HtmlEncode(match.Value)).Append("</span>");
                offset = match.Index + match.Length;
            }

            // Append whatever follows the last match; if nothing matched, this is the whole text,
            // so the method never returns an empty string for non-empty text.
            result.Append(HttpUtility.HtmlEncode(text.Substring(offset)));

            return result.ToString();
        }
		/// <summary>
		/// Converts HTML to plain text.
		/// </summary>
		/// <remarks>
		/// Warning: the returned text may contain HTML fragments!
		/// For example, if the original HTML code is:
		/// <code>
		/// &lt;p&gt;&amp;lt;hello&amp;gt;&lt;/p&gt;
		/// </code>
		/// then the converted text will be:
		/// <code>
		/// &lt;hello&gt;
		/// </code>
		/// So if you intend to output the stripped text in HTML, you need to encode it first!
		/// </remarks>
		/// <param name="input">Input HTML. May be <c>null</c>.</param>
		/// <returns>
		/// Plain text, which may contain angle braces; <c>null</c> when <paramref name="input"/> is <c>null</c>.
		/// </returns>
		public static string RenderToPlainText(string input)
		{
			if (input == null)
			{
				return null;
			}

			// Both the reader and the writer are IDisposable; scope them so they are always released.
			using (var inputReader = new StringReader(input))
			using (var resultWriter = new StringWriter())
			{
				// Pre-size the output buffer to the input length to limit reallocations while writing.
				resultWriter.GetStringBuilder().EnsureCapacity(input.Length);

				// Use "\n" as the line terminator regardless of the platform default.
				resultWriter.NewLine = "\n";

				var textRenderer = new PlainTextRenderer();
				textRenderer.Sanitize(inputReader, resultWriter);

				return resultWriter.ToString();
			}
		}