Exemple #1
0
        private static bool AreEquivalentOrdinalSkipShortCircuitingChecks(ReadOnlySpan <byte> utf8Text, ReadOnlySpan <char> utf16Text)
        {
            while (!utf16Text.IsEmpty)
            {
                // If the next UTF-16 subsequence is malformed or incomplete, or if the next
                // UTF-8 subsequence is malformed or incomplete, or if they don't decode to
                // the exact same Unicode scalar value, fail.
                //
                // The Rune.DecodeFrom* APIs handle empty inputs just fine and return "Incomplete".

                // TODO_UTF8STRING: If we assume Utf8String contains well-formed UTF-8, we could
                // create a version of this method that calls a faster implementation of DecodeFromUtf8.
                // We'd need to be careful not to call that optimized routine if the user passed
                // us a normal ROS<byte> that didn't originate from a Utf8String or similar.

                if (Rune.DecodeFromUtf16(utf16Text, out Rune scalarFromUtf16, out int charsConsumedJustNow) != OperationStatus.Done ||
                    Rune.DecodeFromUtf8(utf8Text, out Rune scalarFromUtf8, out int bytesConsumedJustNow) != OperationStatus.Done ||
                    scalarFromUtf16 != scalarFromUtf8)
                {
                    return(false);
                }

                // TODO_UTF8STRING: As an optimization, we could perform unsafe slices below.

                utf16Text = utf16Text.Slice(charsConsumedJustNow);
                utf8Text  = utf8Text.Slice(bytesConsumedJustNow);
            }

            // We decoded the entire UTF-16 input, and so far it has matched the decoded form
            // of the UTF-8 input. Now just make sure we've also decoded the entirety of the
            // UTF-8 data, otherwise the input strings aren't equivalent.

            return(utf8Text.IsEmpty);
        }
Exemple #2
0
                public bool MoveNext()
                {
                    // Make copies of fields to avoid tearing issues since we're
                    // about to perform unsafe accesses.

                    uint currentCharPair = _currentCharPair;

                    if (currentCharPair > char.MaxValue)
                    {
                        // There was a surrogate pair smuggled in here from a previous operation.
                        // Shift out the high surrogate value and return immediately.

                        _currentCharPair = currentCharPair >> 16;
                        return(true);
                    }

                    ReadOnlySpan <byte> bytes = _obj.AsBytesSkipNullCheck();
                    int nextByteIdx           = _nextByteIdx;

                    if ((uint)nextByteIdx >= (uint)bytes.Length)
                    {
                        return(false); // no more data
                    }

                    // TODO_UTF8STRING: Can we skip correctness checks below?
                    // Perhaps not, this enumerator struct is potentially tearable.

                    OperationStatus status = Rune.DecodeFromUtf8(bytes.Slice(nextByteIdx), out Rune currentRune, out int bytesConsumedJustNow);

                    Debug.Assert(status == OperationStatus.Done);

                    _nextByteIdx = nextByteIdx + bytesConsumedJustNow;

                    if (currentRune.IsBmp)
                    {
                        // Common case - BMP scalar value.

                        _currentCharPair = (uint)currentRune.Value;
                    }
                    else
                    {
                        // Uncommon case - supplementary (astral) plane scalar value.
                        // We'll smuggle the two UTF-16 code units into a single 32-bit value,
                        // with the leading surrogate packed into the low 16 bits of the value,
                        // and the trailing surrogate packed into the high 16 bits of the value.

                        UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar((uint)currentRune.Value, out char leadingCodeUnit, out char trailingCodeUnit);
                        _currentCharPair = (uint)leadingCodeUnit + ((uint)trailingCodeUnit << 16);
                    }

                    return(true);
                }
        public static unsafe int UnescapePercentEncodedUTF8Sequence(char *input, int length, ref ValueStringBuilder dest, bool isQuery, bool iriParsing)
        {
            // The following assertions rely on the input not mutating mid-operation, as is the case currently since callers are working with strings
            // If we start accepting input such as spans, this method must be audited to ensure no buffer overruns/infinite loops could occur

            // As an optimization, this method should only be called after the first character is known to be a part of a non-ascii UTF8 sequence
            Debug.Assert(length >= 3);
            Debug.Assert(input[0] == '%');
            Debug.Assert(UriHelper.DecodeHexChars(input[1], input[2]) != Uri.c_DummyChar);
            Debug.Assert(UriHelper.DecodeHexChars(input[1], input[2]) >= 128);

            uint fourByteBuffer    = 0;
            int  bytesLeftInBuffer = 0;

            int totalCharsConsumed = 0;
            int charsToCopy        = 0;
            int bytesConsumed      = 0;

RefillBuffer:
            int i = totalCharsConsumed + (bytesLeftInBuffer * 3);

ReadByteFromInput:
            if ((uint)(length - i) <= 2 || input[i] != '%')
            {
                goto NoMoreOrInvalidInput;
            }

            uint value = input[i + 1];

            if ((uint)((value - 'A') & ~0x20) <= ('F' - 'A'))
            {
                value = (value | 0x20) - 'a' + 10;
            }
            else if ((value - '8') <= ('9' - '8'))
            {
                value -= '0';
            }
            else
            {
                goto NoMoreOrInvalidInput;  // First character wasn't hex or was <= 7F (Ascii)
            }
            uint second = (uint)input[i + 2] - '0';

            if (second <= 9)
            {
                // second is already [0, 9]
            }
            else if ((uint)((second - ('A' - '0')) & ~0x20) <= ('F' - 'A'))
            {
                second = ((second + '0') | 0x20) - 'a' + 10;
            }
            else
            {
                goto NoMoreOrInvalidInput;  // Second character wasn't Hex
            }
            value = (value << 4) | second;

            Debug.Assert(value >= 128);

            // Rotate the buffer and overwrite the last byte
            if (BitConverter.IsLittleEndian)
            {
                fourByteBuffer = (fourByteBuffer >> 8) | (value << 24);
            }
            else
            {
                fourByteBuffer = (fourByteBuffer << 8) | value;
            }

            if (++bytesLeftInBuffer != 4)
            {
                i += 3;
                goto ReadByteFromInput;
            }

DecodeRune:
            Debug.Assert(totalCharsConsumed % 3 == 0);
            Debug.Assert(bytesLeftInBuffer == 2 || bytesLeftInBuffer == 3 || bytesLeftInBuffer == 4);
            Debug.Assert((fourByteBuffer & (BitConverter.IsLittleEndian ? 0x00000080 : 0x80000000)) != 0);
            Debug.Assert((fourByteBuffer & (BitConverter.IsLittleEndian ? 0x00008000 : 0x00800000)) != 0);
            Debug.Assert(bytesLeftInBuffer < 3 || (fourByteBuffer & (BitConverter.IsLittleEndian ? 0x00800000 : 0x00008000)) != 0);
            Debug.Assert(bytesLeftInBuffer < 4 || (fourByteBuffer & (BitConverter.IsLittleEndian ? 0x80000000 : 0x00000080)) != 0);

            uint temp = fourByteBuffer; // make a copy so that the *copy* (not the original) is marked address-taken

            if (Rune.DecodeFromUtf8(new ReadOnlySpan <byte>(&temp, bytesLeftInBuffer), out Rune rune, out bytesConsumed) == OperationStatus.Done)
            {
                Debug.Assert(bytesConsumed >= 2, $"Rune.DecodeFromUtf8 consumed {bytesConsumed} bytes, likely indicating input was modified concurrently during UnescapePercentEncodedUTF8Sequence's execution");

                if (!iriParsing || IriHelper.CheckIriUnicodeRange((uint)rune.Value, isQuery))
                {
                    if (charsToCopy != 0)
                    {
                        dest.Append(input + totalCharsConsumed - charsToCopy, charsToCopy);
                        charsToCopy = 0;
                    }

                    dest.Append(rune);
                    goto AfterDecodeRune;
                }
            }