/// <summary>
/// Determines whether a UTF-8 buffer and a UTF-16 buffer decode to the same
/// sequence of Unicode scalar values, walking both inputs in lockstep.
/// </summary>
/// <param name="utf8Text">The UTF-8 encoded input.</param>
/// <param name="utf16Text">The UTF-16 encoded input.</param>
/// <returns>
/// <see langword="true"/> if every scalar decoded from <paramref name="utf16Text"/> matches the
/// scalar decoded at the same position of <paramref name="utf8Text"/> and both inputs are
/// exhausted together; <see langword="false"/> if either input contains a subsequence that
/// does not decode with <see cref="OperationStatus.Done"/>, if any scalar differs, or if
/// UTF-8 data remains after the UTF-16 input has been consumed.
/// </returns>
private static bool AreEquivalentOrdinalSkipShortCircuitingChecks(ReadOnlySpan<byte> utf8Text, ReadOnlySpan<char> utf16Text)
{
    ReadOnlySpan<char> remainingChars = utf16Text;
    ReadOnlySpan<byte> remainingBytes = utf8Text;

    // The UTF-16 side drives the loop; the UTF-8 side is validated as we go and
    // checked for leftovers once the UTF-16 side runs dry.
    while (remainingChars.Length != 0)
    {
        // A malformed or truncated UTF-16 subsequence can never be equivalent to anything.
        if (Rune.DecodeFromUtf16(remainingChars, out Rune runeFromChars, out int charCount) != OperationStatus.Done)
        {
            return false;
        }

        // Same for the UTF-8 side. No explicit emptiness check is needed here: any status
        // other than Done (which, per the original note, is what the decoder reports for an
        // empty span instead of throwing) lands in this branch.
        //
        // TODO_UTF8STRING: If Utf8String can be assumed to hold well-formed UTF-8, a faster
        // decode routine could be used here. That fast path must not be taken for an
        // arbitrary ReadOnlySpan<byte> that did not originate from a Utf8String or similar.
        if (Rune.DecodeFromUtf8(remainingBytes, out Rune runeFromBytes, out int byteCount) != OperationStatus.Done)
        {
            return false;
        }

        // Both sides decoded cleanly; they must denote the exact same scalar value.
        if (runeFromChars != runeFromBytes)
        {
            return false;
        }

        // TODO_UTF8STRING: As an optimization, these slices could be performed unsafely.
        remainingChars = remainingChars.Slice(charCount);
        remainingBytes = remainingBytes.Slice(byteCount);
    }

    // All of the UTF-16 input matched. The inputs are equivalent only if the UTF-8
    // input was fully consumed as well.
    return remainingBytes.Length == 0;
}
/// <summary>
/// Advances the enumerator by one UTF-16 code unit, decoding the next scalar value from
/// the underlying UTF-8 bytes when no pending code unit is buffered.
/// </summary>
/// <returns>
/// <see langword="true"/> if a code unit was made available in the packed
/// <c>_currentCharPair</c> state; <see langword="false"/> if the byte index is already
/// at or beyond the end of the data.
/// </returns>
public bool MoveNext()
{
    // Snapshot the field once; this struct may be torn, and the logic below
    // must operate on a single consistent value.
    uint packedChars = _currentCharPair;

    // Anything above char.MaxValue means a previous call stashed a second UTF-16 code unit
    // in the upper 16 bits. Shift it down into the low half (discarding the code unit that
    // was there) and report success without touching the byte stream.
    if (packedChars > char.MaxValue)
    {
        _currentCharPair = packedChars >> 16;
        return true;
    }

    ReadOnlySpan<byte> utf8Data = _obj.AsBytesSkipNullCheck();
    int byteOffset = _nextByteIdx;

    // The unsigned comparison also rejects a negative offset.
    if ((uint)byteOffset >= (uint)utf8Data.Length)
    {
        return false; // no more data
    }

    // TODO_UTF8STRING: Can the correctness checks below be skipped?
    // Perhaps not, since this enumerator struct is potentially tearable.
    OperationStatus status = Rune.DecodeFromUtf8(utf8Data.Slice(byteOffset), out Rune decoded, out int byteCount);
    Debug.Assert(status == OperationStatus.Done);

    _nextByteIdx = byteOffset + byteCount;

    uint nextPackedChars;
    if (!decoded.IsBmp)
    {
        // Uncommon case: a supplementary-plane scalar needs two UTF-16 code units.
        // Pack both into one 32-bit value, leading surrogate in the low 16 bits and
        // trailing surrogate in the high 16 bits, so the next call can shift the
        // trailing one into place.
        UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar((uint)decoded.Value, out char highSurrogate, out char lowSurrogate);
        nextPackedChars = ((uint)lowSurrogate << 16) | (uint)highSurrogate;
    }
    else
    {
        // Common case: a BMP scalar fits in a single UTF-16 code unit.
        nextPackedChars = (uint)decoded.Value;
    }

    _currentCharPair = nextPackedChars;
    return true;
}
public static unsafe int UnescapePercentEncodedUTF8Sequence(char *input, int length, ref ValueStringBuilder dest, bool isQuery, bool iriParsing) { // The following assertions rely on the input not mutating mid-operation, as is the case currently since callers are working with strings // If we start accepting input such as spans, this method must be audited to ensure no buffer overruns/infinite loops could occur // As an optimization, this method should only be called after the first character is known to be a part of a non-ascii UTF8 sequence Debug.Assert(length >= 3); Debug.Assert(input[0] == '%'); Debug.Assert(UriHelper.DecodeHexChars(input[1], input[2]) != Uri.c_DummyChar); Debug.Assert(UriHelper.DecodeHexChars(input[1], input[2]) >= 128); uint fourByteBuffer = 0; int bytesLeftInBuffer = 0; int totalCharsConsumed = 0; int charsToCopy = 0; int bytesConsumed = 0; RefillBuffer: int i = totalCharsConsumed + (bytesLeftInBuffer * 3); ReadByteFromInput: if ((uint)(length - i) <= 2 || input[i] != '%') { goto NoMoreOrInvalidInput; } uint value = input[i + 1]; if ((uint)((value - 'A') & ~0x20) <= ('F' - 'A')) { value = (value | 0x20) - 'a' + 10; } else if ((value - '8') <= ('9' - '8')) { value -= '0'; } else { goto NoMoreOrInvalidInput; // First character wasn't hex or was <= 7F (Ascii) } uint second = (uint)input[i + 2] - '0'; if (second <= 9) { // second is already [0, 9] } else if ((uint)((second - ('A' - '0')) & ~0x20) <= ('F' - 'A')) { second = ((second + '0') | 0x20) - 'a' + 10; } else { goto NoMoreOrInvalidInput; // Second character wasn't Hex } value = (value << 4) | second; Debug.Assert(value >= 128); // Rotate the buffer and overwrite the last byte if (BitConverter.IsLittleEndian) { fourByteBuffer = (fourByteBuffer >> 8) | (value << 24); } else { fourByteBuffer = (fourByteBuffer << 8) | value; } if (++bytesLeftInBuffer != 4) { i += 3; goto ReadByteFromInput; } DecodeRune: Debug.Assert(totalCharsConsumed % 3 == 0); Debug.Assert(bytesLeftInBuffer == 2 || 
bytesLeftInBuffer == 3 || bytesLeftInBuffer == 4); Debug.Assert((fourByteBuffer & (BitConverter.IsLittleEndian ? 0x00000080 : 0x80000000)) != 0); Debug.Assert((fourByteBuffer & (BitConverter.IsLittleEndian ? 0x00008000 : 0x00800000)) != 0); Debug.Assert(bytesLeftInBuffer < 3 || (fourByteBuffer & (BitConverter.IsLittleEndian ? 0x00800000 : 0x00008000)) != 0); Debug.Assert(bytesLeftInBuffer < 4 || (fourByteBuffer & (BitConverter.IsLittleEndian ? 0x80000000 : 0x00000080)) != 0); uint temp = fourByteBuffer; // make a copy so that the *copy* (not the original) is marked address-taken if (Rune.DecodeFromUtf8(new ReadOnlySpan <byte>(&temp, bytesLeftInBuffer), out Rune rune, out bytesConsumed) == OperationStatus.Done) { Debug.Assert(bytesConsumed >= 2, $"Rune.DecodeFromUtf8 consumed {bytesConsumed} bytes, likely indicating input was modified concurrently during UnescapePercentEncodedUTF8Sequence's execution"); if (!iriParsing || IriHelper.CheckIriUnicodeRange((uint)rune.Value, isQuery)) { if (charsToCopy != 0) { dest.Append(input + totalCharsConsumed - charsToCopy, charsToCopy); charsToCopy = 0; } dest.Append(rune); goto AfterDecodeRune; } }