Exemple #1
0
        internal static void GetUtf16SurrogatePairFromAstralScalarValue(uint scalar, out char highSurrogate, out char lowSurrogate)
        {
            Debug.Assert(0x10000 <= scalar && scalar <= UNICODE_LAST_CODEPOINT);

            UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(scalar);

            // This calculation comes from the Unicode specification, Table 3-5.

            highSurrogate = (char)((scalar + ((0xD800u - 0x40u) << 10)) >> 10);
            lowSurrogate  = (char)((scalar & 0x3FFu) + 0xDC00u);
        }
Exemple #2
0
#pragma warning disable IDE0060 // 'this' taken explicitly to avoid argument shuffling by caller
                static int TryEncodeScalarAsHex(object @this, uint scalarValue, Span <char> destination)
#pragma warning restore IDE0060
                {
                    UnicodeDebug.AssertIsValidScalar(scalarValue);

                    // For inputs 0x0000..0x10FFFF, log2 will return 0..20.
                    // (It counts the number of bits following the highest set bit.)
                    //
                    // We divide by 4 to get the number of nibbles (this rounds down),
                    // then +1 to account for rounding effects. This also accounts for
                    // that when log2 results in an exact multiple of 4, no rounding has
                    // taken place, but we need to include a char for the preceding '0x1'.
                    // Finally, we +4 to account for the "&#x" prefix and the ";" suffix,
                    // then -1 to get the index of the last legal location we want to write to.
                    // >> +1 +4 -1 = +4

                    int idxOfSemicolon = (int)((uint)BitOperations.Log2(scalarValue) / 4) + 4;

                    Debug.Assert(4 <= idxOfSemicolon && idxOfSemicolon <= 9, "Expected '&#x0;'..'&#x10FFFF;'.");

                    if (!SpanUtility.IsValidIndex(destination, idxOfSemicolon))
                    {
                        goto OutOfSpaceInner;
                    }
                    destination[idxOfSemicolon] = ';';

                    // It's more efficient to write 4 chars at a time instead of 1 char.
                    // The '0' at the end will be overwritten.
                    if (!SpanUtility.TryWriteChars(destination, '&', '#', 'x', '0'))
                    {
                        Debug.Fail("We should've had enough room to write 4 chars.");
                    }

                    destination = destination.Slice(3, idxOfSemicolon - 3);
                    for (int i = destination.Length - 1; SpanUtility.IsValidIndex(destination, i); i--)
                    {
                        char asUpperHex = HexConverter.ToCharUpper((int)scalarValue);
                        destination[i] = asUpperHex;
                        scalarValue  >>= 4; // write a nibble - not a byte - at a time
                    }

                    return(destination.Length + 4);

OutOfSpaceInner:

                    return(-1);
                }
        /// <summary>
        /// Allows all characters specified by <paramref name="range"/> through the filter.
        /// </summary>
        public virtual void AllowRange(UnicodeRange range)
        {
            if (range is null)
            {
                ThrowHelper.ThrowArgumentNullException(ExceptionArgument.range);
            }

            int firstCodePoint = range.FirstCodePoint;
            int rangeSize      = range.Length;

            for (int i = 0; i < rangeSize; i++)
            {
                int codePoint = firstCodePoint + i;
                UnicodeDebug.AssertIsBmpCodePoint((uint)codePoint); // UnicodeRange only supports BMP
                _allowedCodePointsBitmap.AllowChar((char)codePoint);
            }
        }
Exemple #4
0
        /// <summary>
        /// Disallows all characters specified by <paramref name="range"/> through the filter.
        /// </summary>
        public virtual void ForbidRange(UnicodeRange range)
        {
            if (range == null)
            {
                throw new ArgumentNullException(nameof(range));
            }

            int firstCodePoint = range.FirstCodePoint;
            int rangeSize      = range.Length;

            for (int i = 0; i < rangeSize; i++)
            {
                int codePoint = firstCodePoint + i;
                UnicodeDebug.AssertIsBmpCodePoint((uint)codePoint); // UnicodeRange only supports BMP
                _allowedCodePointsBitmap.ForbidChar((char)codePoint);
            }
        }
Exemple #5
0
#pragma warning disable IDE0060 // 'this' taken explicitly to avoid argument shuffling by caller
                static int TryEncodeScalarAsHex(object @this, uint scalarValue, Span <byte> destination)
#pragma warning restore IDE0060
                {
                    UnicodeDebug.AssertIsValidScalar(scalarValue);

                    // See comments in the UTF-16 equivalent method later in this file.

                    int idxOfSemicolon = (int)((uint)BitOperations.Log2(scalarValue) / 4) + 4;

                    Debug.Assert(4 <= idxOfSemicolon && idxOfSemicolon <= 9, "Expected '&#x0;'..'&#x10FFFF;'.");

                    if (!SpanUtility.IsValidIndex(destination, idxOfSemicolon))
                    {
                        goto OutOfSpaceInner;
                    }
                    destination[idxOfSemicolon] = (byte)';';

                    if (!SpanUtility.TryWriteBytes(destination, (byte)'&', (byte)'#', (byte)'x', (byte)'0'))
                    {
                        Debug.Fail("We should've had enough room to write 4 bytes.");
                    }

                    destination = destination.Slice(3, idxOfSemicolon - 3);
                    for (int i = destination.Length - 1; SpanUtility.IsValidIndex(destination, i); i--)
                    {
                        char asUpperHex = HexConverter.ToCharUpper((int)scalarValue);
                        destination[i] = (byte)asUpperHex;
                        scalarValue  >>= 4; // write a nibble - not a byte - at a time
                    }

                    return(destination.Length + 4);

OutOfSpaceInner:

                    return(-1);
                }
Exemple #6
0
        /// <summary>
        /// A copy of the logic in Rune.DecodeFromUtf8.
        /// </summary>
        public static OperationStatus DecodeScalarValueFromUtf8(ReadOnlySpan <byte> source, out uint result, out int bytesConsumed)
        {
            const char ReplacementChar = '\uFFFD';

            // This method follows the Unicode Standard's recommendation for detecting
            // the maximal subpart of an ill-formed subsequence. See The Unicode Standard,
            // Ch. 3.9 for more details. In summary, when reporting an invalid subsequence,
            // it tries to consume as many code units as possible as long as those code
            // units constitute the beginning of a longer well-formed subsequence per Table 3-7.

            int index = 0;

            // Try reading input[0].

            if ((uint)index >= (uint)source.Length)
            {
                goto NeedsMoreData;
            }

            uint tempValue = source[index];

            if (!UnicodeUtility.IsAsciiCodePoint(tempValue))
            {
                goto NotAscii;
            }

Finish:

            bytesConsumed = index + 1;
            Debug.Assert(1 <= bytesConsumed && bytesConsumed <= 4); // Valid subsequences are always length [1..4]
            result = tempValue;
            return(OperationStatus.Done);

NotAscii:

            // Per Table 3-7, the beginning of a multibyte sequence must be a code unit in
            // the range [C2..F4]. If it's outside of that range, it's either a standalone
            // continuation byte, or it's an overlong two-byte sequence, or it's an out-of-range
            // four-byte sequence.

            if (!UnicodeUtility.IsInRangeInclusive(tempValue, 0xC2, 0xF4))
            {
                goto FirstByteInvalid;
            }

            tempValue = (tempValue - 0xC2) << 6;

            // Try reading input[1].

            index++;
            if ((uint)index >= (uint)source.Length)
            {
                goto NeedsMoreData;
            }

            // Continuation bytes are of the form [10xxxxxx], which means that their two's
            // complement representation is in the range [-65..-128]. This allows us to
            // perform a single comparison to see if a byte is a continuation byte.

            int thisByteSignExtended = (sbyte)source[index];

            if (thisByteSignExtended >= -64)
            {
                goto Invalid;
            }

            tempValue += (uint)thisByteSignExtended;
            tempValue += 0x80;               // remove the continuation byte marker
            tempValue += (0xC2 - 0xC0) << 6; // remove the leading byte marker

            if (tempValue < 0x0800)
            {
                Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0080, 0x07FF));
                goto Finish; // this is a valid 2-byte sequence
            }

            // This appears to be a 3- or 4-byte sequence. Since per Table 3-7 we now have
            // enough information (from just two code units) to detect overlong or surrogate
            // sequences, we need to perform these checks now.

            if (!UnicodeUtility.IsInRangeInclusive(tempValue, ((0xE0 - 0xC0) << 6) + (0xA0 - 0x80), ((0xF4 - 0xC0) << 6) + (0x8F - 0x80)))
            {
                // The first two bytes were not in the range [[E0 A0]..[F4 8F]].
                // This is an overlong 3-byte sequence or an out-of-range 4-byte sequence.
                goto Invalid;
            }

            if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xED - 0xC0) << 6) + (0xA0 - 0x80), ((0xED - 0xC0) << 6) + (0xBF - 0x80)))
            {
                // This is a UTF-16 surrogate code point, which is invalid in UTF-8.
                goto Invalid;
            }

            if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xF0 - 0xC0) << 6) + (0x80 - 0x80), ((0xF0 - 0xC0) << 6) + (0x8F - 0x80)))
            {
                // This is an overlong 4-byte sequence.
                goto Invalid;
            }

            // The first two bytes were just fine. We don't need to perform any other checks
            // on the remaining bytes other than to see that they're valid continuation bytes.

            // Try reading input[2].

            index++;
            if ((uint)index >= (uint)source.Length)
            {
                goto NeedsMoreData;
            }

            thisByteSignExtended = (sbyte)source[index];
            if (thisByteSignExtended >= -64)
            {
                goto Invalid; // this byte is not a UTF-8 continuation byte
            }

            tempValue <<= 6;
            tempValue  += (uint)thisByteSignExtended;
            tempValue  += 0x80;                // remove the continuation byte marker
            tempValue  -= (0xE0 - 0xC0) << 12; // remove the leading byte marker

            if (tempValue <= 0xFFFF)
            {
                Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0800, 0xFFFF));
                goto Finish; // this is a valid 3-byte sequence
            }

            // Try reading input[3].

            index++;
            if ((uint)index >= (uint)source.Length)
            {
                goto NeedsMoreData;
            }

            thisByteSignExtended = (sbyte)source[index];
            if (thisByteSignExtended >= -64)
            {
                goto Invalid; // this byte is not a UTF-8 continuation byte
            }

            tempValue <<= 6;
            tempValue  += (uint)thisByteSignExtended;
            tempValue  += 0x80;                // remove the continuation byte marker
            tempValue  -= (0xF0 - 0xE0) << 18; // remove the leading byte marker

            UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(tempValue);
            goto Finish; // this is a valid 4-byte sequence

FirstByteInvalid:

            index = 1; // Invalid subsequences are always at least length 1.

Invalid:

            Debug.Assert(1 <= index && index <= 3); // Invalid subsequences are always length 1..3
            bytesConsumed = index;
            result        = ReplacementChar;
            return(OperationStatus.InvalidData);

NeedsMoreData:

            Debug.Assert(0 <= index && index <= 3); // Incomplete subsequences are always length 0..3
            bytesConsumed = index;
            result        = ReplacementChar;
            return(OperationStatus.NeedMoreData);
        }
Exemple #7
0
 private static void _GetIndexAndOffset(uint value, out nuint index, out int offset)
 {
     UnicodeDebug.AssertIsBmpCodePoint(value);
     index  = value >> 5;
     offset = (int)value & 0x1F;
 }