/// <summary>
        /// Encodes input strings for use in HTML.
        /// </summary>
        /// <param name="input">String to be encoded</param>
        /// <param name="useNamedEntities">Value indicating if the HTML 4.0 named entities should be used.</param>
        /// <param name="encoderTweak">A <see cref="MethodSpecificEncoder"/> function, if needed.</param>
        /// <returns>
        /// Encoded string for use in HTML.
        /// </returns>
        private static string HtmlEncode(string input, bool useNamedEntities, MethodSpecificEncoder encoderTweak)
        {
            if (string.IsNullOrEmpty(input))
            {
                return(input);
            }

            if (characterValues == null)
            {
                InitialiseSafeList();
            }

            char[][] namedEntities = null;
            if (useNamedEntities)
            {
                namedEntities = namedEntitiesLazy.Value;
            }

            // Setup a new StringBuilder for output.
            // Worse case scenario - the longest entity name, thetasym is 10 characters, including the & and ;.
            StringBuilder builder = EncoderUtil.GetOutputStringBuilder(input.Length, 10);

            AcquireReadLock();
            try
            {
                Utf16StringReader stringReader = new Utf16StringReader(input);
                while (true)
                {
                    int currentCodePoint = stringReader.ReadNextScalarValue();
                    if (currentCodePoint < 0)
                    {
                        break; // EOF
                    }

                    if (currentCodePoint > char.MaxValue)
                    {
                        // We don't have a pre-generated mapping of characters beyond the Basic Multilingual
                        // Plane (BMP), so we need to generate these encodings on-the-fly. We should encode
                        // the code point rather than the surrogate code units that make up this code point.
                        // See: http://www.w3.org/International/questions/qa-escapes#bytheway
                        char[] encodedCharacter = SafeList.HashThenValueGenerator(currentCodePoint);
                        builder.Append('&');
                        builder.Append(encodedCharacter);
                        builder.Append(';');
                    }
                    else
                    {
                        // If we reached this point, the code point is within the BMP.
                        char currentCharacter = (char)currentCodePoint;

                        if (encoderTweak != null && encoderTweak(currentCharacter, out char[] tweekedValue))
Exemple #2
0
        /// <summary>
        /// Encodes input strings for use in HTML.
        /// </summary>
        /// <param name="input">String to be encoded.</param>
        /// <param name="useNamedEntities">Value indicating if the HTML 4.0 named entities should be used.</param>
        /// <param name="encoderTweak">A <see cref="MethodSpecificEncoder"/> function, if needed. When it returns
        /// <c>true</c> for a character, the characters it supplies are written to the output as-is (no
        /// <c>&amp;</c> / <c>;</c> wrapping) in place of that character.</param>
        /// <returns>
        /// Encoded string for use in HTML. A <c>null</c> or empty <paramref name="input"/> is returned unchanged.
        /// </returns>
        /// <exception cref="InvalidUnicodeValueException">Thrown if a character with an invalid Unicode value
        /// (U+FFFE or U+FFFF) is encountered within the input string.</exception>
        /// <exception cref="InvalidSurrogatePairException">Thrown if a high surrogate code unit is encountered without a
        /// following low surrogate code unit, or a low surrogate code unit is encountered without having been preceded
        /// by a high surrogate code unit.</exception>
        private static string HtmlEncode(string input, bool useNamedEntities, MethodSpecificEncoder encoderTweak)
        {
            if (string.IsNullOrEmpty(input))
            {
                return input;
            }

            if (characterValues == null)
            {
                InitialiseSafeList();
            }

            if (useNamedEntities && namedEntities == null)
            {
                InitialiseNamedEntityList();
            }

            int inputLength = input.Length;

            // Use a growable buffer instead of a fixed "length * 10" array: the fixed array over-allocated for
            // typical input, could overflow the int multiplication for very large strings, and could be overrun
            // when encoderTweak supplies a replacement longer than the assumed 10 character worst case.
            System.Text.StringBuilder builder = new System.Text.StringBuilder(inputLength);

            SyncLock.EnterReadLock();
            try
            {
                for (int i = 0; i < inputLength; i++)
                {
                    char currentCharacter = input[i];
                    int currentCodePoint = currentCharacter;
                    char[] tweakedValue;

                    // Check for invalid values.
                    if (currentCodePoint == 0xFFFE || currentCodePoint == 0xFFFF)
                    {
                        throw new InvalidUnicodeValueException(currentCodePoint);
                    }

                    if (char.IsHighSurrogate(currentCharacter))
                    {
                        if (i + 1 == inputLength)
                        {
                            // The input ends on a lone high surrogate.
                            throw new InvalidSurrogatePairException(currentCharacter, '\0');
                        }

                        // Now peek ahead and check if the following character is a low surrogate.
                        char nextCharacter = input[i + 1];
                        if (!char.IsLowSurrogate(nextCharacter))
                        {
                            throw new InvalidSurrogatePairException(currentCharacter, nextCharacter);
                        }

                        // Look-ahead was good, so consume the low surrogate as well.
                        i++;

                        // Encode the combined code point rather than the two surrogate code units.
                        // Both halves were validated above, so ConvertToUtf32 cannot throw here.
                        long combinedCodePoint = char.ConvertToUtf32(currentCharacter, nextCharacter);
                        builder.Append('&');
                        builder.Append(SafeList.HashThenValueGenerator(combinedCodePoint));
                        builder.Append(';');
                    }
                    else if (char.IsLowSurrogate(currentCharacter))
                    {
                        // A low surrogate that was not consumed by the high surrogate branch is unpaired.
                        throw new InvalidSurrogatePairException('\0', currentCharacter);
                    }
                    else if (encoderTweak != null && encoderTweak(currentCharacter, out tweakedValue))
                    {
                        // The caller-specific encoder takes precedence over the named entity and safe list
                        // lookups; its output is emitted exactly as provided.
                        builder.Append(tweakedValue);
                    }
                    else if (useNamedEntities && namedEntities[currentCodePoint] != null)
                    {
                        // A named entity exists for this character.
                        builder.Append('&');
                        builder.Append(namedEntities[currentCodePoint]);
                        builder.Append(';');
                    }
                    else if (characterValues[currentCodePoint] != null)
                    {
                        // Character needs to be encoded using its pre-generated value.
                        builder.Append('&');
                        builder.Append(characterValues[currentCodePoint]);
                        builder.Append(';');
                    }
                    else
                    {
                        // Character does not need encoding.
                        builder.Append(currentCharacter);
                    }
                }
            }
            finally
            {
                SyncLock.ExitReadLock();
            }

            return builder.ToString();
        }