コード例 #1
0
        // Transcodes UTF-16 chars to UTF-8 bytes one scalar value at a time.
        //
        // inputLength is measured in chars; outputBytesRemaining is measured in bytes.
        // When the method returns, pInputBufferRemaining points at the first char that was
        // NOT consumed and pOutputBufferRemaining points at the first byte that was NOT written.
        //
        // Returns OperationStatus.Done when the whole input was transcoded, the status reported
        // by Rune.DecodeFromUtf16 when decoding stops early, or OperationStatus.DestinationTooSmall
        // when the next scalar value does not fit in the remaining output space.
        public static OperationStatus TranscodeToUtf8(char *pInputBuffer, int inputLength, byte *pOutputBuffer, int outputBytesRemaining, out char *pInputBufferRemaining, out byte *pOutputBufferRemaining)
        {
            Debug.Assert(inputLength >= 0, "Input length must not be negative.");
            Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");

            Debug.Assert(outputBytesRemaining >= 0, "Destination length must not be negative.");
            Debug.Assert(pOutputBuffer != null || outputBytesRemaining == 0, "Destination length must be zero if destination buffer pointer is null.");

            ReadOnlySpan<char> pendingChars = new ReadOnlySpan<char>(pInputBuffer, inputLength);
            Span<byte> freeBytes = new Span<byte>(pOutputBuffer, outputBytesRemaining);
            OperationStatus result = OperationStatus.Done;

            while (pendingChars.Length > 0)
            {
                result = Rune.DecodeFromUtf16(pendingChars, out Rune scalar, out int charCount);

                if (result == OperationStatus.Done)
                {
                    if (scalar.TryEncodeToUtf8(freeBytes, out int byteCount))
                    {
                        // Both halves succeeded: advance past what was read and written.
                        pendingChars = pendingChars.Slice(charCount);
                        freeBytes = freeBytes.Slice(byteCount);
                        continue;
                    }

                    // The scalar decoded fine but there is no room left to emit it.
                    result = OperationStatus.DestinationTooSmall;
                }

                // Either decoding failed or the destination is full; stop without advancing.
                break;
            }

            // Translate "how much is left" back into pointers for the caller.
            int charsConsumed = inputLength - pendingChars.Length;
            int bytesProduced = outputBytesRemaining - freeBytes.Length;

            pInputBufferRemaining = pInputBuffer + charsConsumed;
            pOutputBufferRemaining = pOutputBuffer + bytesProduced;

            return result;
        }
コード例 #2
0
        // Attempts to interpret the search term as exactly one Rune.
        //
        // Accepted inputs are a char, a Rune, a string, or a ustring. For the two
        // text types the decode must succeed AND consume the whole value, i.e. the
        // text has to be a single scalar value with nothing left over.
        // Returns true and sets 'parsed' on success; otherwise returns false with
        // 'parsed' set to default.
        private static bool TryParseSearchTermAsRune(object searchTerm, out Rune parsed)
        {
            switch (searchTerm)
            {
                case char singleChar:
                    // Rune.TryCreate decides whether this lone char is acceptable.
                    return Rune.TryCreate(singleChar, out parsed);

                case Rune alreadyRune:
                    parsed = alreadyRune;
                    return true;

                case string utf16Text:
                {
                    OperationStatus utf16Status = Rune.DecodeFromUtf16(utf16Text, out parsed, out int charsConsumed);
                    bool wholeStringIsOneRune = utf16Status == OperationStatus.Done && charsConsumed == utf16Text.Length;
                    if (wholeStringIsOneRune)
                    {
                        return true;
                    }
                    break;
                }

                case ustring utf8Text:
                {
                    OperationStatus utf8Status = Rune.DecodeFromUtf8(utf8Text.AsBytes(), out parsed, out int bytesConsumed);
                    bool wholeStringIsOneRune = utf8Status == OperationStatus.Done && bytesConsumed == utf8Text.Length;
                    if (wholeStringIsOneRune)
                    {
                        return true;
                    }
                    break;
                }
            }

            // Unsupported type, or the text was not exactly one scalar value.
            parsed = default;
            return false;
        }
コード例 #3
0
ファイル: TextEncoder.cs プロジェクト: mikem8361/runtime
        // Per the original note on this routine: it skips the call to FindFirstCharacterToEncode.
        //
        // Walks 'source' one scalar value at a time. Scalars for which WillEncode returns false
        // are copied to 'destination' unchanged; everything else (including input that fails to
        // decode, which is expected to surface as the replacement character) goes through
        // TryEncodeUnicodeScalar.
        //
        // charsConsumed / charsWritten report how far the method got in each buffer.
        // Returns Done when all of 'source' was processed, NeedMoreData when the input ends in
        // an incomplete sequence and isFinalBlock is false, or DestinationTooSmall when the
        // next output does not fit.
        private protected virtual OperationStatus EncodeCore(ReadOnlySpan <char> source, Span <char> destination, out int charsConsumed, out int charsWritten, bool isFinalBlock)
        {
            ReadOnlySpan<char> pendingInput = source;
            Span<char> freeOutput = destination;
            OperationStatus result = OperationStatus.Done;

            while (!pendingInput.IsEmpty)
            {
                OperationStatus decodeStatus = Rune.DecodeFromUtf16(pendingInput, out Rune rune, out int inputCharCount);

                bool needsEscaping;
                if (decodeStatus == OperationStatus.Done)
                {
                    needsEscaping = WillEncode(rune.Value);
                }
                else if (decodeStatus == OperationStatus.NeedMoreData && !isFinalBlock)
                {
                    // The caller promised more input; report the partial sequence and stop here.
                    result = OperationStatus.NeedMoreData;
                    break;
                }
                else
                {
                    // Undecodable input is always routed through the encoder.
                    Debug.Assert(rune == Rune.ReplacementChar); // should be replacement char
                    needsEscaping = true;
                }

                int outputCharCount;
                bool fitInDestination;
                if (needsEscaping)
                {
                    fitInDestination = TryEncodeUnicodeScalar((uint)rune.Value, freeOutput, out outputCharCount);
                }
                else
                {
                    // Reflecting input directly to the output: same number of chars written as read.
                    fitInDestination = rune.TryEncodeToUtf16(freeOutput, out _);
                    outputCharCount = inputCharCount;
                }

                if (!fitInDestination)
                {
                    result = OperationStatus.DestinationTooSmall;
                    break;
                }

                pendingInput = pendingInput.Slice(inputCharCount);
                freeOutput = freeOutput.Slice(outputCharCount);
            }

            // Progress is whatever has been sliced off the front of each buffer.
            charsConsumed = source.Length - pendingInput.Length;
            charsWritten = destination.Length - freeOutput.Length;
            return result;
        }
コード例 #4
0
        public Utf16Splitter(ReadOnlySpan <char> span, ReadOnlySpan <char> separator, StringSplitOptions splitOptions)
        {
            ReadOnlySpan <char> separatorSlice = separator;

            do
            {
                var separatorStatus = Rune.DecodeFromUtf16(separatorSlice, out _, out int consumed);
                if (separatorStatus == OperationStatus.InvalidData)
                {
                    throw new ArgumentException("The separator is not valid UTF16.", nameof(separator));
                }
                separatorSlice = separatorSlice[consumed..];
コード例 #5
0
        // Builds the expected escaped form of a whole string by decoding it one Rune at a
        // time and concatenating the per-Rune expected representation.
        // The decode status is intentionally not inspected; whatever Rune and char count
        // DecodeFromUtf16 reports is used as-is.
        private string GetExpectedEscapedRepresentation(string value)
        {
            var escaped = new StringBuilder();
            int position = 0;

            while (position < value.Length)
            {
                ReadOnlySpan<char> rest = value.AsSpan(position);
                Rune.DecodeFromUtf16(rest, out Rune current, out int width);

                string escapedRune = GetExpectedEscapedRepresentation(current);
                escaped.Append(escapedRune);

                position += width;
            }

            return escaped.ToString();
        }
コード例 #6
0
 // <SnippetExample>
 static ReadOnlySpan <char> TrimNonLettersAndNonDigits(ReadOnlySpan <char> span)
 {
     // First, trim from the front.
     // If any Rune can't be decoded
     // (return value is anything other than "Done"),
     // or if the Rune is a letter or digit,
     // stop trimming from the front and
     // instead work from the end.
     while (Rune.DecodeFromUtf16(span, out Rune rune, out int charsConsumed) == OperationStatus.Done)
     {
         if (Rune.IsLetterOrDigit(rune))
         {
             break;
         }
         span = span[charsConsumed..];
コード例 #7
0
ファイル: TextEncoder.cs プロジェクト: mikem8361/runtime
        // Encodes one scalar value to UTF-8 by first producing its escaped UTF-16 form in
        // 'utf16ScratchBuffer' (via TryEncodeUnicodeScalar) and then transcoding those chars
        // into 'utf8Destination'.
        //
        // Returns true with bytesWritten set to the number of bytes produced, or false with
        // bytesWritten = 0 when 'utf8Destination' runs out of room.
        private bool TryEncodeUnicodeScalarUtf8(uint unicodeScalar, Span <char> utf16ScratchBuffer, Span <byte> utf8Destination, out int bytesWritten)
        {
            bool scratchWasBigEnough = TryEncodeUnicodeScalar(unicodeScalar, utf16ScratchBuffer, out int charsWritten);
            if (!scratchWasBigEnough)
            {
                // We really don't expect any encoder to exceed 24 escaped chars per input scalar.
                // If this happens, throw an exception and we can figure out if we want to support it
                // in the future.
                ThrowArgumentException_MaxOutputCharsPerInputChar();
            }

            // Only the chars that were actually produced need to be transcoded.
            Span<char> pendingChars = utf16ScratchBuffer.Slice(0, charsWritten);
            int outputIndex = 0;

            while (!pendingChars.IsEmpty)
            {
                OperationStatus decodeStatus = Rune.DecodeFromUtf16(pendingChars, out Rune rune, out int runeCharCount);
                if (decodeStatus != OperationStatus.Done)
                {
                    // Wrote bad UTF-16 data, we cannot transcode to UTF-8.
                    ThrowArgumentException_MaxOutputCharsPerInputChar();
                }

                // The helper's result is emitted low byte first, shifting right by 8 bits
                // after each byte until nothing is left.
                uint packedUtf8 = (uint)UnicodeHelpers.GetUtf8RepresentationForScalarValue((uint)rune.Value);
                while (true)
                {
                    if (!SpanUtility.IsValidIndex(utf8Destination, outputIndex))
                    {
                        bytesWritten = 0; // ran out of space in the destination
                        return false;
                    }

                    utf8Destination[outputIndex] = (byte)packedUtf8;
                    outputIndex++;

                    packedUtf8 >>= 8;
                    if (packedUtf8 == 0)
                    {
                        break;
                    }
                }

                pendingChars = pendingChars.Slice(runeCharCount);
            }

            bytesWritten = outputIndex;
            return true;
        }
コード例 #8
0
        // Scans 'text' (textLength chars) and returns the index of the first char at which
        // scanning stops: either a scalar value that WillEncode says must be escaped, or
        // input that Rune.DecodeFromUtf16 cannot decode. Returns -1 when the scan reaches
        // the end of the input.
        public override unsafe int FindFirstCharacterToEncode(char *text, int textLength)
        {
            var input = new ReadOnlySpan<char>(text, textLength);
            int offset = 0;

            while (true)
            {
                OperationStatus status = Rune.DecodeFromUtf16(input.Slice(offset), out Rune rune, out int runeCharCount);

                // Stop on undecodable input (including running off the end of the buffer)
                // or on the first scalar that needs to be escaped.
                if (status != OperationStatus.Done || WillEncode(rune.Value))
                {
                    break;
                }

                offset += runeCharCount;
            }

            // Reaching the end means none of the characters in the string needs to be escaped.
            return (offset == input.Length) ? -1 : offset;
        }
コード例 #9
0
        // Returns true when every scalar value in 'text' decodes successfully and none of
        // them is in the PrivateUse or OtherNotAssigned Unicode category; false otherwise.
        // 'text' must be non-empty (asserted).
        private static bool IcuIsSortable(ReadOnlySpan <char> text)
        {
            Debug.Assert(!GlobalizationMode.Invariant);
            Debug.Assert(!GlobalizationMode.UseNls);
            Debug.Assert(!text.IsEmpty);

            ReadOnlySpan<char> remaining = text;

            while (true)
            {
                OperationStatus status = Rune.DecodeFromUtf16(remaining, out Rune rune, out int runeCharCount);
                if (status != OperationStatus.Done)
                {
                    // found an unpaired surrogate somewhere in the text
                    return false;
                }

                switch (Rune.GetUnicodeCategory(rune))
                {
                    case UnicodeCategory.PrivateUse:
                    case UnicodeCategory.OtherNotAssigned:
                        // can't sort private use or unassigned code points
                        return false;
                }

                remaining = remaining.Slice(runeCharCount);
                if (remaining.IsEmpty)
                {
                    // saw no unsortable data in the buffer
                    return true;
                }
            }
        }
コード例 #10
0
ファイル: RuneTests.cs プロジェクト: lateralusX/runtime
 [InlineData(new char[] { '\ud800', '\u1234' }, OperationStatus.InvalidData, 0xFFFD, 1)] // standalone high surrogate
 public static void DecodeFromUtf16(char[] data, OperationStatus expectedOperationStatus, int expectedRuneValue, int expectedCharsConsumed)
 {
     // Decode once, then check each of the three results (status, scalar value,
     // chars consumed) against the expectations supplied by the test data.
     OperationStatus actualOperationStatus = Rune.DecodeFromUtf16(data, out Rune actualRune, out int actualCharsConsumed);

     Assert.Equal(expectedOperationStatus, actualOperationStatus);
     Assert.Equal(expectedRuneValue, actualRune.Value);
     Assert.Equal(expectedCharsConsumed, actualCharsConsumed);
 }
コード例 #11
0
        // Cross-checks several UTF-8 -> UTF-16 code paths against each other using the bytes
        // exposed by _data: Encoding.UTF8 (GetCharCount / GetChars), Rune-by-Rune decoding,
        // Utf8.ToUtf16 (with and without replacement), and a custom DecoderReplacementFallback.
        // Progress is written to the console; the first disagreement throws an Exception.
        public void RunTest()
        {
            Console.WriteLine("-- BEGIN TEST --");

            // Baseline: how many UTF-16 chars Encoding.UTF8 says the input decodes to.
            int encodingCharCount = Encoding.UTF8.GetCharCount(_data.Span);

            Console.WriteLine($"Encoding.UTF8.GetCharCount returned {encodingCharCount}.");

            // Check 1: summing Utf16SequenceLength over every decoded Rune must give the same count.
            {
                ReadOnlySpan <byte> input = _data.Span;
                int runeIterCharCount     = 0;
                while (!input.IsEmpty)
                {
                    // Status is ignored on purpose; the Rune produced for bad input still counts.
                    Rune.DecodeFromUtf8(input, out Rune thisRune, out int bytesConsumed);
                    runeIterCharCount += thisRune.Utf16SequenceLength; // ok if U+FFFD replacement
                    input              = input.Slice(bytesConsumed);
                }

                Console.WriteLine($"Rune iteration said there were {runeIterCharCount} UTF-16 chars.");

                if (encodingCharCount != runeIterCharCount)
                {
                    throw new Exception("Rune iteration char count mismatch!!");
                }
            }

            // Check 2: GetChars into an exactly-sized buffer must report the same char count.
            char[] chars        = new char[encodingCharCount];
            int    charsWritten = Encoding.UTF8.GetChars(_data.Span, chars);

            Console.WriteLine($"Encoding.UTF8.GetChars returned {charsWritten} chars written.");

            if (encodingCharCount != charsWritten)
            {
                throw new Exception("GetChars return value mismatch!!");
            }

            // Check 3: walking the UTF-8 input and the UTF-16 output in lockstep must yield
            // the same Rune at every step, and both must run out at the same time.
            {
                ReadOnlySpan <byte> inputUtf8  = _data.Span;
                ReadOnlySpan <char> inputUtf16 = chars;

                while (!inputUtf8.IsEmpty && !inputUtf16.IsEmpty)
                {
                    Rune.DecodeFromUtf8(inputUtf8, out Rune inputUtf8Rune, out int bytesConsumed);
                    Rune.DecodeFromUtf16(inputUtf16, out Rune inputUtf16Rune, out int charsConsumed);

                    if (inputUtf8Rune != inputUtf16Rune)
                    {
                        throw new Exception("Enumerating runes mismatch!!");
                    }

                    inputUtf8  = inputUtf8.Slice(bytesConsumed);
                    inputUtf16 = inputUtf16.Slice(charsConsumed);
                }

                // The loop exits when either side is empty, so equal lengths means both are empty.
                if (inputUtf8.Length != inputUtf16.Length)
                {
                    throw new Exception("Rune enumeration returned mismatched lengths!");
                }
            }

            Console.WriteLine("Running ToUtf16 with replace=true and exact size buffer.");

            // Check 4: Utf8.ToUtf16 with replacement into an exactly-sized buffer must consume
            // all input, fill the buffer, and match the GetChars output.
            {
                char[]          chars2   = new char[chars.Length];
                OperationStatus opStatus = Utf8.ToUtf16(_data.Span, chars2, out int bytesReadJustNow, out int charsWrittenJustNow, replaceInvalidSequences: true, isFinalBlock: true);

                if (opStatus != OperationStatus.Done)
                {
                    throw new Exception("Utf8.ToUtf16 returned wrong OperationStatus!!");
                }

                if (bytesReadJustNow != _data.Memory.Length)
                {
                    throw new Exception("Utf8.ToUtf16 didn't read entire input!!");
                }

                if (charsWrittenJustNow != chars2.Length)
                {
                    throw new Exception("Utf8.ToUtf16 didn't fill entire response buffer!!");
                }

                if (!chars.SequenceEqual(chars2))
                {
                    throw new Exception("Utf8.ToUtf16 returned different data than Encoding.UTF8.GetChars!!");
                }
            }

            Console.WriteLine("Running ToUtf16 with replace=true and extra large buffer.");

            // Check 5: same as check 4 but with 1024 chars of slack in the destination; only the
            // written prefix is compared against the GetChars output.
            {
                char[]          chars2   = new char[chars.Length + 1024];
                OperationStatus opStatus = Utf8.ToUtf16(_data.Span, chars2, out int bytesReadJustNow, out int charsWrittenJustNow, replaceInvalidSequences: true, isFinalBlock: true);

                if (opStatus != OperationStatus.Done)
                {
                    throw new Exception("Utf8.ToUtf16 returned wrong OperationStatus!!");
                }

                if (bytesReadJustNow != _data.Memory.Length)
                {
                    throw new Exception("Utf8.ToUtf16 didn't read entire input!!");
                }

                if (charsWrittenJustNow != chars.Length)
                {
                    throw new Exception("Utf8.ToUtf16 didn't fill entire response buffer!!");
                }

                if (!chars2.AsSpan(0, charsWrittenJustNow).SequenceEqual(chars))
                {
                    throw new Exception("Utf8.ToUtf16 returned different data than Encoding.UTF8.GetChars!!");
                }
            }

            Console.WriteLine("Running ToUtf16 with replace=false and extra large buffer.");

            // Check 6: without replacement, ToUtf16 is called repeatedly on the remaining input.
            // Everything it reports as read/written in one call must decode cleanly and
            // identically on both sides.
            {
                ReadOnlySpan <byte> input  = _data.Span;
                Span <char>         output = new char[chars.Length + 1024];

                while (!input.IsEmpty)
                {
                    OperationStatus opStatus = Utf8.ToUtf16(input, output, out int bytesReadJustNow, out int charsWrittenJustNow, replaceInvalidSequences: false, isFinalBlock: true);

                    // Restrict the comparison to exactly what this call consumed and produced.
                    ReadOnlySpan <byte> dataReadJustNow    = input.Slice(0, bytesReadJustNow);
                    ReadOnlySpan <char> dataWrittenJustNow = output.Slice(0, charsWrittenJustNow);

                    while (!dataReadJustNow.IsEmpty && !dataWrittenJustNow.IsEmpty)
                    {
                        OperationStatus utf8Status  = Rune.DecodeFromUtf8(dataReadJustNow, out Rune inputUtf8Rune, out int bytesConsumed);
                        OperationStatus utf16Status = Rune.DecodeFromUtf16(dataWrittenJustNow, out Rune inputUtf16Rune, out int charsConsumed);

                        if (utf8Status != OperationStatus.Done)
                        {
                            throw new Exception("DecodeFromUtf8 returned unexpected value!!");
                        }

                        if (utf16Status != OperationStatus.Done)
                        {
                            throw new Exception("DecodeFromUtf16 returned unexpected value!!");
                        }

                        if (inputUtf8Rune != inputUtf16Rune)
                        {
                            throw new Exception("Enumerating runes mismatch!!");
                        }

                        dataReadJustNow    = dataReadJustNow.Slice(bytesConsumed);
                        dataWrittenJustNow = dataWrittenJustNow.Slice(charsConsumed);
                    }

                    // As above: equal lengths after the loop means both slices were fully consumed.
                    if (dataReadJustNow.Length != dataWrittenJustNow.Length)
                    {
                        throw new Exception("Unexpected length mismatch!!");
                    }

                    input = input.Slice(bytesReadJustNow);

                    if (opStatus != OperationStatus.Done)
                    {
                        // Skip over invalid data

                        // Rune.DecodeFromUtf8 reports how many bytes the bad sequence occupies.
                        Rune.DecodeFromUtf8(input, out _, out int bytesToSkip);
                        input = input.Slice(bytesToSkip);
                    }
                }
            }

            Console.WriteLine("Trying custom decoder replacement.");

            // Check 7: rebuild the string by hand (ToUtf16 without replacement, appending "{BAD}"
            // each time a call stops early) and compare with what a UTF-8 Encoding configured
            // with DecoderReplacementFallback("{BAD}") produces.
            {
                // use a custom replacement string
                Encoding encoding = Encoding.GetEncoding("utf-8", EncoderFallback.ExceptionFallback, new DecoderReplacementFallback("{BAD}"));

                string decoded = encoding.GetString(_data.Span);

                ReadOnlySpan <byte> input = _data.Span;
                char[]        decoded2    = new char[decoded.Length];
                StringBuilder builder     = new StringBuilder();

                while (!input.IsEmpty)
                {
                    // decoded2 is reused as scratch space on every iteration; only the prefix
                    // written by this call is appended to the builder.
                    OperationStatus opStatus = Utf8.ToUtf16(input, decoded2, out int bytesReadJustNow, out int charsWrittenJustNow, replaceInvalidSequences: false, isFinalBlock: true);
                    builder.Append(decoded2, 0, charsWrittenJustNow);

                    input = input.Slice(bytesReadJustNow);

                    if (opStatus != OperationStatus.Done)
                    {
                        // Skip over invalid data

                        Rune.DecodeFromUtf8(input, out _, out int bytesToSkip);
                        input = input.Slice(bytesToSkip);

                        builder.Append("{BAD}");
                    }
                }

                if (new string(decoded) != builder.ToString())
                {
                    throw new Exception("Custom decoder replacement failed!!");
                }
            }

            Console.WriteLine("-- END TEST - SUCCESS --");
        }
コード例 #12
0
ファイル: Utf8.cs プロジェクト: Fredo-Q/dotnet-coreclr
        /*
         * OperationStatus-based APIs for transcoding of chunked data.
         * This method is similar to Encoding.UTF8.GetBytes / GetChars but has a
         * different calling convention, different error handling mechanisms, and
         * different performance characteristics.
         *
         * If 'replaceInvalidSequences' is true, the method will replace any ill-formed
         * subsequence in the source with U+FFFD when transcoding to the destination,
         * then it will continue processing the remainder of the buffers. Otherwise
         * the method will return OperationStatus.InvalidData.
         *
         * If the method does return an error code, the out parameters will represent
         * how much of the data was successfully transcoded, and the location of the
         * ill-formed subsequence can be deduced from these values.
         *
         * If 'replaceInvalidSequences' is true, the method is guaranteed never to return
         * OperationStatus.InvalidData. If 'isFinalBlock' is true, the method is
         * guaranteed never to return OperationStatus.NeedMoreData.
         */

        /// <summary>
        /// Converts the UTF-16 text in <paramref name="source"/> to UTF-8 and writes it to
        /// <paramref name="destination"/>, one scalar value at a time.
        /// </summary>
        /// <param name="source">The UTF-16 input.</param>
        /// <param name="destination">The buffer that receives the UTF-8 output.</param>
        /// <param name="numCharsRead">On return, the number of chars consumed from <paramref name="source"/>.</param>
        /// <param name="numBytesWritten">On return, the number of bytes written to <paramref name="destination"/>.</param>
        /// <param name="replaceInvalidSequences">
        /// <see langword="true"/> to substitute U+FFFD for invalid UTF-16 input and keep going;
        /// <see langword="false"/> to stop and return <see cref="OperationStatus.InvalidData"/>.
        /// </param>
        /// <param name="isFinalBlock">
        /// <see langword="true"/> if no further input will follow, in which case an incomplete
        /// sequence at the end of <paramref name="source"/> is treated as invalid data;
        /// <see langword="false"/> to return <see cref="OperationStatus.NeedMoreData"/> instead.
        /// </param>
        /// <returns>
        /// <see cref="OperationStatus.Done"/> when all of <paramref name="source"/> was consumed;
        /// otherwise the reason transcoding stopped early.
        /// </returns>
        /// <remarks>
        /// When <paramref name="replaceInvalidSequences"/> is <see langword="true"/> this method
        /// does not return <see cref="OperationStatus.InvalidData"/>.
        /// </remarks>
        public static OperationStatus FromUtf16(ReadOnlySpan <char> source, Span <byte> destination, out int numCharsRead, out int numBytesWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
        {
            ReadOnlySpan<char> pendingChars = source;
            Span<byte> freeBytes = destination;
            OperationStatus result = OperationStatus.Done;

            while (!pendingChars.IsEmpty)
            {
                result = Rune.DecodeFromUtf16(pendingChars, out Rune scalar, out int charCount);

                if (result == OperationStatus.NeedMoreData)
                {
                    // The input ended part-way through a sequence. That is only an error when
                    // the caller has said no more data is coming.
                    if (!isFinalBlock)
                    {
                        break;
                    }

                    result = OperationStatus.InvalidData;
                }

                if (result == OperationStatus.InvalidData)
                {
                    // Without replacement, invalid input ends the operation right here.
                    if (!replaceInvalidSequences)
                    {
                        break;
                    }

                    scalar = Rune.ReplacementChar;
                }

                // At this point 'scalar' is the value to emit; the only remaining way to fail
                // is running out of room in the destination.
                if (!scalar.TryEncodeToUtf8(freeBytes, out int byteCount))
                {
                    result = OperationStatus.DestinationTooSmall;
                    break;
                }

                // Advance by the decoder's char count rather than the Rune's own length, since
                // the Rune may be a substituted replacement character.
                pendingChars = pendingChars.Slice(charCount);
                freeBytes = freeBytes.Slice(byteCount);
                result = OperationStatus.Done; // a replaced sequence still counts as success
            }

            numCharsRead = source.Length - pendingChars.Length;
            numBytesWritten = destination.Length - freeBytes.Length;

            Debug.Assert((result == OperationStatus.Done) == (numCharsRead == source.Length),
                         "Should report OperationStatus.Done if and only if we've consumed the entire input buffer.");

            return result;
        }
コード例 #13
0
        /// <summary>
        /// Peek at the next <see cref="Rune"/> without advancing the position
        /// </summary>
        /// <param name="charsConsumed">The amount of <see cref="Char"/> used by the <see cref="Rune"/>.</param>
        /// <returns>The next <see cref="Rune"/> in the <see cref="Source"/>.</returns>
        internal Rune PeekRune(out Int32 charsConsumed)
        {
            ReadOnlySpan <Char> peek = Peek(2);

            switch (Rune.DecodeFromUtf16(peek, out Rune result, out charsConsumed))
            {