/// <summary>
/// Splits a supplementary-plane scalar value (U+10000 and above) into its
/// UTF-16 high and low surrogate code units.
/// </summary>
/// <param name="scalar">The scalar value to split; asserted (debug only) to be a supplementary-plane scalar.</param>
/// <param name="highSurrogate">Receives the leading (high) surrogate.</param>
/// <param name="lowSurrogate">Receives the trailing (low) surrogate.</param>
internal static void GetUtf16SurrogatePairFromAstralScalarValue(uint scalar, out char highSurrogate, out char lowSurrogate)
{
    Debug.Assert(scalar >= 0x10000 && scalar <= UNICODE_LAST_CODEPOINT);
    UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(scalar);

    // Per the Unicode specification, Table 3-5: adding this bias before shifting
    // right by 10 both removes the 0x10000 offset (0x40 << 10) and applies the
    // 0xD800 high-surrogate base in a single addition.
    const uint HighSurrogateBias = (0xD800u - 0x40u) << 10;

    uint biasedScalar = scalar + HighSurrogateBias;
    uint trailingTenBits = scalar & 0x3FFu;

    highSurrogate = (char)(biasedScalar >> 10);
    lowSurrogate = (char)(trailingTenBits + 0xDC00u);
}
#pragma warning disable IDE0060 // 'this' taken explicitly to avoid argument shuffling by caller
// Writes the HTML hexadecimal character reference "&#xH...H;" (uppercase hex,
// no leading zeros) for scalarValue into the start of destination.
//
// Returns the number of chars written (5..10), or -1 if destination is too
// small; in the -1 case nothing has been written to destination.
static int TryEncodeScalarAsHex(object @this, uint scalarValue, Span<char> destination)
#pragma warning restore IDE0060
{
    UnicodeDebug.AssertIsValidScalar(scalarValue);

    // For inputs 0x0000..0x10FFFF, log2 will return 0..20.
    // (It counts the number of bits following the highest set bit.)
    //
    // We divide by 4 to get the number of nibbles (this rounds down),
    // then +1 to account for rounding effects. This also accounts for
    // that when log2 results in an exact multiple of 4, no rounding has
    // taken place, but we need to include a char for the preceding '0x1'.
    // Finally, we +4 to account for the "&#x" prefix and the ";" suffix,
    // then -1 to get the index of the last legal location we want to write to.
    // >> +1 +4 -1 = +4
    int idxOfSemicolon = (int)((uint)BitOperations.Log2(scalarValue) / 4) + 4;
    Debug.Assert(4 <= idxOfSemicolon && idxOfSemicolon <= 9, "Expected '&#x0;'..'&#x10FFFF;'.");

    // Probing the last index first means a single bounds check covers every
    // subsequent write.
    if (!SpanUtility.IsValidIndex(destination, idxOfSemicolon))
    {
        goto OutOfSpaceInner;
    }

    destination[idxOfSemicolon] = ';';

    // It's more efficient to write 4 chars at a time instead of 1 char.
    // The '0' at the end will be overwritten.
    if (!SpanUtility.TryWriteChars(destination, '&', '#', 'x', '0'))
    {
        Debug.Fail("We should've had enough room to write 4 chars.");
    }

    // Narrow the span to just the hex digits (between "&#x" and ';') and fill
    // it from the least significant nibble backwards.
    destination = destination.Slice(3, idxOfSemicolon - 3);
    for (int i = destination.Length - 1; SpanUtility.IsValidIndex(destination, i); i--)
    {
        // NOTE(review): HexConverter.ToCharUpper is expected to look only at the
        // low 4 bits of its argument - confirm against its definition.
        char asUpperHex = HexConverter.ToCharUpper((int)scalarValue);
        destination[i] = asUpperHex;
        scalarValue >>= 4; // write a nibble - not a byte - at a time
    }

    // digits + "&#x" (3) + ";" (1)
    return destination.Length + 4;

OutOfSpaceInner:

    return -1;
}
/// <summary>
/// Allows all characters specified by <paramref name="range"/> through the filter.
/// </summary>
/// <param name="range">The range of code points to allow; must not be <see langword="null"/>.</param>
public virtual void AllowRange(UnicodeRange range)
{
    if (range is null)
    {
        // NOTE(review): ThrowHelper is defined elsewhere; by its name it is
        // expected to throw ArgumentNullException and not return.
        ThrowHelper.ThrowArgumentNullException(ExceptionArgument.range);
    }

    // Walk the range one code point at a time, marking each as allowed.
    int codePoint = range.FirstCodePoint;
    for (int remaining = range.Length; remaining > 0; remaining--, codePoint++)
    {
        UnicodeDebug.AssertIsBmpCodePoint((uint)codePoint); // UnicodeRange only supports BMP
        _allowedCodePointsBitmap.AllowChar((char)codePoint);
    }
}
/// <summary>
/// Disallows all characters specified by <paramref name="range"/> through the filter.
/// </summary>
/// <param name="range">The range of code points to forbid.</param>
/// <exception cref="ArgumentNullException"><paramref name="range"/> is <see langword="null"/>.</exception>
public virtual void ForbidRange(UnicodeRange range)
{
    // 'is null' matches the guard used by AllowRange and cannot be redirected
    // by an overloaded equality operator.
    if (range is null)
    {
        throw new ArgumentNullException(nameof(range));
    }

    int firstCodePoint = range.FirstCodePoint;
    int rangeSize = range.Length;

    // Clear the "allowed" flag for each code point in the range.
    for (int i = 0; i < rangeSize; i++)
    {
        int codePoint = firstCodePoint + i;
        UnicodeDebug.AssertIsBmpCodePoint((uint)codePoint); // UnicodeRange only supports BMP
        _allowedCodePointsBitmap.ForbidChar((char)codePoint);
    }
}
#pragma warning disable IDE0060 // 'this' taken explicitly to avoid argument shuffling by caller
// Writes the HTML hexadecimal character reference "&#xH...H;" (uppercase hex,
// no leading zeros) for scalarValue, as single-byte ASCII characters, into
// the start of destination.
//
// Returns the number of bytes written (5..10), or -1 if destination is too
// small; in the -1 case nothing has been written to destination.
static int TryEncodeScalarAsHex(object @this, uint scalarValue, Span<byte> destination)
#pragma warning restore IDE0060
{
    UnicodeDebug.AssertIsValidScalar(scalarValue);

    // For inputs 0x0000..0x10FFFF, log2 returns 0..20. Dividing by 4 (rounding
    // down) and adding 1 gives the number of hex digits (1..6). Adding 3 for
    // the "&#x" prefix gives the index of the trailing ';', i.e. log2/4 + 4.
    int idxOfSemicolon = (int)((uint)BitOperations.Log2(scalarValue) / 4) + 4;
    Debug.Assert(4 <= idxOfSemicolon && idxOfSemicolon <= 9, "Expected '&#x0;'..'&#x10FFFF;'.");

    // Probing the last index first means a single bounds check covers every
    // subsequent write.
    if (!SpanUtility.IsValidIndex(destination, idxOfSemicolon))
    {
        goto OutOfSpaceInner;
    }

    destination[idxOfSemicolon] = (byte)';';

    // Write the prefix as one 4-byte unit; the trailing '0' is a placeholder
    // that the digit loop below overwrites.
    if (!SpanUtility.TryWriteBytes(destination, (byte)'&', (byte)'#', (byte)'x', (byte)'0'))
    {
        Debug.Fail("We should've had enough room to write 4 bytes.");
    }

    // Narrow the span to just the hex digits (between "&#x" and ';') and fill
    // it from the least significant nibble backwards.
    destination = destination.Slice(3, idxOfSemicolon - 3);
    for (int i = destination.Length - 1; SpanUtility.IsValidIndex(destination, i); i--)
    {
        // NOTE(review): HexConverter.ToCharUpper is expected to look only at the
        // low 4 bits of its argument - confirm against its definition.
        char asUpperHex = HexConverter.ToCharUpper((int)scalarValue);
        destination[i] = (byte)asUpperHex;
        scalarValue >>= 4; // write a nibble - not a byte - at a time
    }

    // digits + "&#x" (3) + ";" (1)
    return destination.Length + 4;

OutOfSpaceInner:

    return -1;
}
/// <summary>
/// A copy of the logic in Rune.DecodeFromUtf8.
/// Decodes a single scalar value from the start of a UTF-8 byte sequence.
/// </summary>
/// <param name="source">The UTF-8 bytes to decode from; only the first (up to four) bytes are examined.</param>
/// <param name="result">
/// On <see cref="OperationStatus.Done"/>, the decoded scalar value; otherwise U+FFFD (the replacement character).
/// </param>
/// <param name="bytesConsumed">
/// On <see cref="OperationStatus.Done"/>, the length (1..4) of the decoded sequence.
/// On <see cref="OperationStatus.InvalidData"/>, the length (1..3) of the invalid subsequence.
/// On <see cref="OperationStatus.NeedMoreData"/>, the number of bytes (0..3) that were read before input ran out.
/// </param>
/// <returns>
/// <see cref="OperationStatus.Done"/> for a well-formed sequence,
/// <see cref="OperationStatus.InvalidData"/> for an ill-formed one, or
/// <see cref="OperationStatus.NeedMoreData"/> when <paramref name="source"/> is empty or ends
/// partway through a sequence that is well-formed so far.
/// </returns>
public static OperationStatus DecodeScalarValueFromUtf8(ReadOnlySpan <byte> source, out uint result, out int bytesConsumed)
{
    const char ReplacementChar = '\uFFFD';

    // This method follows the Unicode Standard's recommendation for detecting
    // the maximal subpart of an ill-formed subsequence. See The Unicode Standard,
    // Ch. 3.9 for more details. In summary, when reporting an invalid subsequence,
    // it tries to consume as many code units as possible as long as those code
    // units constitute the beginning of a longer well-formed subsequence per Table 3-7.

    // 'index' is always the position of the byte most recently read (or about to be read).
    int index = 0;

    // Try reading input[0].
    if ((uint)index >= (uint)source.Length)
    {
        goto NeedsMoreData;
    }

    uint tempValue = source[index];
    if (!UnicodeUtility.IsAsciiCodePoint(tempValue))
    {
        goto NotAscii;
    }

    // Shared success exit: every valid path jumps here with 'index' pointing at the
    // last byte of the sequence and 'tempValue' holding the fully decoded scalar.
Finish:
    bytesConsumed = index + 1;
    Debug.Assert(1 <= bytesConsumed && bytesConsumed <= 4); // Valid subsequences are always length [1..4]
    result = tempValue;
    return(OperationStatus.Done);

NotAscii:
    // Per Table 3-7, the beginning of a multibyte sequence must be a code unit in
    // the range [C2..F4]. If it's outside of that range, it's either a standalone
    // continuation byte, or it's an overlong two-byte sequence, or it's an out-of-range
    // four-byte sequence.
    if (!UnicodeUtility.IsInRangeInclusive(tempValue, 0xC2, 0xF4))
    {
        goto FirstByteInvalid;
    }

    // Rebase the lead byte against 0xC2 and make room for the 6 payload bits of the
    // next byte. The 0xC2 bias is converted to a 0xC0 bias further below.
    tempValue = (tempValue - 0xC2) << 6;

    // Try reading input[1].
    index++;
    if ((uint)index >= (uint)source.Length)
    {
        goto NeedsMoreData;
    }

    // Continuation bytes are of the form [10xxxxxx], which means that their two's
    // complement representation is in the range [-65..-128]. This allows us to
    // perform a single comparison to see if a byte is a continuation byte.
    int thisByteSignExtended = (sbyte)source[index];
    if (thisByteSignExtended >= -64)
    {
        goto Invalid;
    }

    // Adding the sign-extended byte and then 0x80 is the same as adding the byte's
    // low 6 bits (-128..-65 plus 128 gives 0..63).
    tempValue += (uint)thisByteSignExtended;
    tempValue += 0x80; // remove the continuation byte marker
    tempValue += (0xC2 - 0xC0) << 6; // remove the leading byte marker

    // At this point tempValue == ((lead - 0xC0) << 6) + (second - 0x80), which is the
    // form the range checks below are written in.
    if (tempValue < 0x0800)
    {
        Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0080, 0x07FF));
        goto Finish; // this is a valid 2-byte sequence
    }

    // This appears to be a 3- or 4-byte sequence. Since per Table 3-7 we now have
    // enough information (from just two code units) to detect overlong or surrogate
    // sequences, we need to perform these checks now.
    if (!UnicodeUtility.IsInRangeInclusive(tempValue, ((0xE0 - 0xC0) << 6) + (0xA0 - 0x80), ((0xF4 - 0xC0) << 6) + (0x8F - 0x80)))
    {
        // The first two bytes were not in the range [[E0 A0]..[F4 8F]].
        // This is an overlong 3-byte sequence or an out-of-range 4-byte sequence.
        goto Invalid;
    }

    if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xED - 0xC0) << 6) + (0xA0 - 0x80), ((0xED - 0xC0) << 6) + (0xBF - 0x80)))
    {
        // This is a UTF-16 surrogate code point, which is invalid in UTF-8.
        goto Invalid;
    }

    if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xF0 - 0xC0) << 6) + (0x80 - 0x80), ((0xF0 - 0xC0) << 6) + (0x8F - 0x80)))
    {
        // This is an overlong 4-byte sequence.
        goto Invalid;
    }

    // The first two bytes were just fine. We don't need to perform any other checks
    // on the remaining bytes other than to see that they're valid continuation bytes.

    // Try reading input[2].
    index++;
    if ((uint)index >= (uint)source.Length)
    {
        goto NeedsMoreData;
    }

    thisByteSignExtended = (sbyte)source[index];
    if (thisByteSignExtended >= -64)
    {
        goto Invalid; // this byte is not a UTF-8 continuation byte
    }

    tempValue <<= 6;
    tempValue += (uint)thisByteSignExtended;
    tempValue += 0x80; // remove the continuation byte marker
    tempValue -= (0xE0 - 0xC0) << 12; // remove the leading byte marker

    // A 4-byte lead (F0..F4) leaves a value above 0xFFFF here, so this comparison
    // also distinguishes 3-byte sequences from 4-byte ones.
    if (tempValue <= 0xFFFF)
    {
        Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0800, 0xFFFF));
        goto Finish; // this is a valid 3-byte sequence
    }

    // Try reading input[3].
    index++;
    if ((uint)index >= (uint)source.Length)
    {
        goto NeedsMoreData;
    }

    thisByteSignExtended = (sbyte)source[index];
    if (thisByteSignExtended >= -64)
    {
        goto Invalid; // this byte is not a UTF-8 continuation byte
    }

    tempValue <<= 6;
    tempValue += (uint)thisByteSignExtended;
    tempValue += 0x80; // remove the continuation byte marker
    tempValue -= (0xF0 - 0xE0) << 18; // remove the leading byte marker

    UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(tempValue);
    goto Finish; // this is a valid 4-byte sequence

FirstByteInvalid:
    index = 1; // Invalid subsequences are always at least length 1.

    // Falls through from FirstByteInvalid. When reached by a goto, 'index' is the
    // position of the offending byte, so the bytes before it form the invalid
    // subsequence and the offending byte itself is not consumed.
Invalid:
    Debug.Assert(1 <= index && index <= 3); // Invalid subsequences are always length 1..3
    bytesConsumed = index;
    result = ReplacementChar;
    return(OperationStatus.InvalidData);

    // Input ended before the sequence was complete; 'index' equals the number of
    // bytes successfully read so far.
NeedsMoreData:
    Debug.Assert(0 <= index && index <= 3); // Incomplete subsequences are always length 0..3
    bytesConsumed = index;
    result = ReplacementChar;
    return(OperationStatus.NeedMoreData);
}
// Splits a BMP code point into a quotient and remainder by 32:
// 'index' receives value / 32 and 'offset' receives the low five bits.
// NOTE(review): presumably these are the word index and bit position within a
// bitmap of 32-bit words - confirm against the callers.
private static void _GetIndexAndOffset(uint value, out nuint index, out int offset)
{
    const int LowFiveBitsMask = 0x1F;

    UnicodeDebug.AssertIsBmpCodePoint(value);

    offset = (int)value & LowFiveBitsMask;
    index = value / 32u;
}