Ejemplo n.º 1
0
        /// <summary>
        /// Counts the UTF-8 bytes produced by transcoding the given UTF-16 text. The encoded
        /// output itself is discarded: every code point is written to the start of the same
        /// scratch buffer.
        /// </summary>
        /// <param name="utf16source">UTF-16 text to measure.</param>
        /// <returns>The sum of the byte counts reported by <c>TryEncodeCodePoint</c> for each decoded code point.</returns>
        /// <exception cref="ArgumentOutOfRangeException">A code point could not be decoded from <paramref name="utf16source"/>.</exception>
        /// <exception cref="NotImplementedException">A code point could not be encoded into the scratch buffer.</exception>
        public static int ComputeEncodedBytes(ReadOnlySpan<char> utf16source)
        {
            const int ScratchSize = 32;

            Span<byte> scratch;
            unsafe
            {
                byte* stackBuffer = stackalloc byte[ScratchSize];
                scratch = new Span<byte>(stackBuffer, ScratchSize);
            }

            var sourceBytes = utf16source.Cast<char, byte>();
            int total = 0;
            int offset = 0;

            // The offset only advances by whatever the decoder reports as consumed.
            while (offset < sourceBytes.Length)
            {
                UnicodeCodePoint codePoint;
                int consumed;
                if (!Utf16LittleEndianEncoder.TryDecodeCodePoint(sourceBytes.Slice(offset), out codePoint, out consumed))
                {
                    throw new ArgumentOutOfRangeException(nameof(utf16source));
                }
                offset += consumed;

                // The scratch buffer is intentionally not sliced; only the byte count matters.
                int written;
                if (!TryEncodeCodePoint(codePoint, scratch, out written))
                {
                    throw new NotImplementedException("this should resize the buffer");
                }
                total += written;
            }

            return total;
        }
Ejemplo n.º 2
0
        // TODO: this routine needs to be optimized.
        /// <summary>
        /// Transcodes UTF-16 text into UTF-8, writing the result into <paramref name="utf8Destination"/>.
        /// </summary>
        /// <param name="utf16source">UTF-16 text to transcode.</param>
        /// <param name="utf8Destination">Buffer that receives the UTF-8 bytes.</param>
        /// <param name="encodedBytes">
        /// Running total of bytes written. When the method returns <c>false</c> it holds the count
        /// for the code points that were encoded before the failure.
        /// </param>
        /// <returns>
        /// <c>true</c> if every code point was encoded; <c>false</c> as soon as
        /// <c>TryEncodeCodePoint</c> reports failure for the remaining destination.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException">A code point could not be decoded from <paramref name="utf16source"/>.</exception>
        public static bool TryEncode(ReadOnlySpan<char> utf16source, Span<byte> utf8Destination, out int encodedBytes)
        {
            var sourceBytes = utf16source.Cast<char, byte>();
            int offset = 0;

            encodedBytes = 0;
            while (offset < sourceBytes.Length)
            {
                UnicodeCodePoint codePoint;
                int consumed;
                if (!Utf16LittleEndianEncoder.TryDecodeCodePoint(sourceBytes.Slice(offset), out codePoint, out consumed))
                {
                    throw new ArgumentOutOfRangeException(nameof(utf16source));
                }
                offset += consumed;

                int written;
                if (!TryEncodeCodePoint(codePoint, utf8Destination, out written))
                {
                    return false;
                }

                // Move the destination window past the bytes that were just produced.
                encodedBytes += written;
                utf8Destination = utf8Destination.Slice(written);
            }

            return true;
        }
        /// <summary>
        /// Decodes a range of UTF-8 bytes into the backing character buffer (<c>str</c>), starting at
        /// the current <c>used</c> position, and returns a <see cref="ValueString"/> over the
        /// characters that were appended.
        /// </summary>
        /// <param name="arr">Array containing the UTF-8 bytes.</param>
        /// <param name="start">Index in <paramref name="arr"/> of the first byte to decode.</param>
        /// <param name="length">Number of bytes to decode.</param>
        /// <returns>A <see cref="ValueString"/> referencing the newly written characters in the backing buffer.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="arr"/> is null.</exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="start"/> or <paramref name="length"/> is negative, or the range extends past the end of <paramref name="arr"/>.
        /// </exception>
        /// <exception cref="Exception">A code point could not be encoded as UTF-16.</exception>
        public unsafe ValueString CreateValueStringFromUtf8(byte[] arr, int start, int length)
        {
            if (arr == null)
            {
                throw new ArgumentNullException(nameof(arr));
            }

            // Reject negative values explicitly and compare via subtraction so that
            // start + length can never overflow and slip past the check.
            if (start < 0 || length < 0 || start > arr.Length - length)
            {
                throw new ArgumentException();
            }

            // NOTE(review): presumably guarantees room for length * 2 more chars in str — confirm against EnsureSpace.
            EnsureSpace(length * 2);
            fixed (char* ptr = str)
            {
                char* writeStart = ptr + used;
                char* cursor = writeStart;
                int offset = used;

                foreach (var codePoint in new Utf8String.CodePointEnumerable(arr, start, length))
                {
                    int charsEncoded;
                    if (!Utf16LittleEndianEncoder.TryEncodeCodePoint(codePoint, cursor, out charsEncoded))
                    {
                        // TODO: Change Exception type
                        throw new Exception("invalid character");
                    }
                    cursor += charsEncoded;
                }

                // `used` is only advanced once the whole range has been decoded successfully.
                int strlen = (int)(cursor - writeStart);
                used += strlen;
                return new ValueString(str, offset, strlen);
            }
        }
Ejemplo n.º 4
0
        private const byte b1111_1000U = 0xF8; //248

        #endregion Constants

        #region Decoding implementation

        /// <summary>
        /// Decodes a span of UTF-8 characters into UTF-16.
        ///
        /// This method will consume as many of the input characters as possible.
        ///
        /// On successful exit, the entire input was consumed and encoded successfully. In this case, <paramref name="bytesConsumed"/> will be
        /// equal to the length of the <paramref name="utf8"/> and <paramref name="charactersWritten"/> will equal the total number of characters written to
        /// the <paramref name="utf16"/>.
        ///
        /// On unsuccessful exit, the following conditions can exist.
        ///  1) If the output buffer has been filled and no more input characters can be encoded, another call to this method with the input sliced to
        ///     exclude the already encoded characters (using <paramref name="bytesConsumed"/>) and a new output buffer will continue the encoding.
        ///  2) Encoding may have also stopped because the input buffer contains an invalid sequence.
        /// </summary>
        /// <param name="utf8">A span containing a sequence of UTF-8 characters.</param>
        /// <param name="utf16">A span to write the UTF-16 data into.</param>
        /// <param name="bytesConsumed">On exit, contains the number of bytes that were consumed from the <paramref name="utf8"/> span.</param>
        /// <param name="charactersWritten">An output parameter to store the number of characters written to <paramref name="utf16"/></param>
        /// <returns>True if the input buffer was fully encoded into the output buffer, otherwise false.</returns>
        public static bool TryDecode(ReadOnlySpan<byte> utf8, Span<char> utf16, out int bytesConsumed, out int charactersWritten)
        {
            charactersWritten = 0;
            bytesConsumed = 0;

            while (true)
            {
                // All input processed: report success.
                if (bytesConsumed >= utf8.Length)
                {
                    return true;
                }

                // Read one code point starting at the current input offset.
                uint codePoint;
                int consumed;
                bool decoded = TryDecodeCodePoint(utf8, bytesConsumed, out codePoint, out consumed);
                if (!decoded)
                {
                    return false;
                }

                // Write it at the current output offset; the counters are left untouched on failure.
                int written;
                bool encoded = Utf16LittleEndianEncoder.TryEncode(codePoint, utf16, charactersWritten, out written);
                if (!encoded)
                {
                    return false;
                }

                bytesConsumed += consumed;
                charactersWritten += written;
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Builds a <see cref="string"/> by encoding every code point in <c>CodePoints</c> as UTF-16.
        /// </summary>
        /// <returns>A new string containing the encoded characters.</returns>
        /// <exception cref="Exception">A code point could not be encoded into the character buffer.</exception>
        public override string ToString()
        {
            // Largest character count that is staged on the stack rather than the heap.
            const int MaxStackChars = 256;

            // Pass 1: count UTF-16 code units. A code point outside the BMP takes two.
            // TODO: Optimize for characters of length 1 or 2 in UTF-8 representation (no need to read anything)
            // TODO: Should we use Linq's Count()?
            int len = 0;
            foreach (var codePoint in CodePoints)
            {
                len += UnicodeCodePoint.IsBmp(codePoint) ? 1 : 2;
            }

            unsafe
            {
                char* stackChars = null;
                char[] heapChars = null;
                Span<byte> remaining;

                if (len > MaxStackChars)
                {
                    // TODO: Can System.Buffers be used here?
                    heapChars = new char[len];
                    remaining = heapChars.Slice().Cast<char, byte>();
                }
                else
                {
                    char* onStack = stackalloc char[len];
                    stackChars = onStack;
                    remaining = new Span<byte>(stackChars, len * 2);
                }

                // Pass 2: encode each code point and shrink the byte window past what was written.
                foreach (var codePoint in CodePoints)
                {
                    int written;
                    if (!Utf16LittleEndianEncoder.TryEncodeCodePoint(codePoint, remaining, out written))
                    {
                        // TODO: Change Exception type
                        throw new Exception("invalid character");
                    }
                    remaining = remaining.Slice(written);
                }

                // TODO: We already have a char[] and this will copy, how to avoid that
                if (stackChars != null)
                {
                    return new string(stackChars, 0, len);
                }
                return new string(heapChars);
            }
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Builds a <see cref="string"/> by encoding every code point in <c>CodePoints</c> as UTF-16.
        /// </summary>
        /// <returns>A new string containing the encoded characters.</returns>
        /// <exception cref="Exception">A code point could not be encoded into the character buffer.</exception>
        public override string ToString()
        {
            // Pass 1: count the UTF-16 code units required.
            // A code point needs a second code unit (a surrogate pair) exactly when it lies outside
            // the Basic Multilingual Plane. Testing whether the code point itself is a surrogate
            // would under-count supplementary characters and leave the buffer too small.
            // TODO: Optimize for characters of length 1 or 2 in UTF-8 representation (no need to read anything)
            // TODO: Should we use Linq's Count()?
            int len = 0;

            foreach (var codePoint in CodePoints)
            {
                len++;
                if (!UnicodeCodePoint.IsBmp(codePoint))
                {
                    len++;
                }
            }

            char[] characters = new char[len];
            unsafe
            {
                fixed (char* pinnedCharacters = characters)
                {
                    // View the char array as bytes; each char occupies two bytes.
                    Span<byte> buffer = new Span<byte>((byte*)pinnedCharacters, len * 2);

                    // Pass 2: encode each code point and advance past the bytes written.
                    foreach (var codePoint in CodePoints)
                    {
                        int bytesEncoded;
                        if (!Utf16LittleEndianEncoder.TryEncodeCodePoint(codePoint, buffer, out bytesEncoded))
                        {
                            // TODO: Change Exception type
                            throw new Exception("invalid character");
                        }
                        buffer = buffer.Slice(bytesEncoded);
                    }
                }
            }

            // TODO: We already have a char[] and this will copy, how to avoid that
            return new string(characters);
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Writes the characters of <paramref name="value"/> from <paramref name="startIndex"/> through
        /// <paramref name="endIndex"/> (inclusive) to <paramref name="writer"/> as UTF-8, one code point at a time.
        /// </summary>
        /// <param name="writer">Destination that supplies free buffers and accepts committed bytes.</param>
        /// <param name="value">Source string.</param>
        /// <param name="startIndex">Index of the first character to write.</param>
        /// <param name="endIndex">Index of the last character at which a code point may start.</param>
        /// <exception cref="ArgumentException">A code point could not be decoded from <paramref name="value"/>.</exception>
        private static void AppendStringInternal(this IWriter writer, string value, int startIndex, int endIndex)
        {
            int position = startIndex;
            while (position <= endIndex)
            {
                UnicodeCodePoint codePoint;
                int charsRead;
                if (!Utf16LittleEndianEncoder.TryDecodeCodePointFromString(value, position, out codePoint, out charsRead))
                {
                    throw new ArgumentException();
                }
                position += charsRead;

                // Request a buffer of MaxUtf8CodePointBytes for this code point's UTF-8 form.
                var target = writer.GetFreeBuffer(MaxUtf8CodePointBytes).ToSpan();
                int bytesWritten;
                bool encoded = Utf8Encoder.TryEncodeCodePoint(codePoint, target, out bytesWritten);
                Debug.Assert(encoded);
                writer.CommitBytes(bytesWritten);
            }
        }
Ejemplo n.º 8
0
        // TODO: This should return Utf16CodeUnits which should wrap byte[]/Span<byte>, same for other encoders
        /// <summary>
        /// Converts a UTF-16 string into a newly allocated array containing its UTF-8 encoding.
        /// Works in two passes: the first measures the encoded length, the second encodes into an
        /// array of exactly that size.
        /// </summary>
        /// <param name="s">String to convert.</param>
        /// <returns>A byte array whose length equals the total encoded size computed in the first pass.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
        /// <exception cref="ArgumentException">A code point could not be decoded from <paramref name="s"/>.</exception>
        /// <exception cref="Exception">An internal consistency check failed while measuring or encoding.</exception>
        private static byte[] GetUtf8BytesFromString(string s)
        {
            if (s == null)
            {
                throw new ArgumentNullException(nameof(s));
            }

            // Pass 1: validate the string and add up the encoded size of each code point.
            int len = 0;

            for (int i = 0; i < s.Length; /* intentionally no increment */)
            {
                UnicodeCodePoint codePoint;
                int encodedChars;
                if (!Utf16LittleEndianEncoder.TryDecodeCodePointFromString(s, i, out codePoint, out encodedChars))
                {
                    // ArgumentException takes the message first and the parameter name second.
                    throw new ArgumentException("Invalid surrogate pair in the string.", nameof(s));
                }

                // A non-positive advance would make this loop spin forever.
                if (encodedChars <= 0)
                {
                    // TODO: Fix exception type
                    throw new Exception("internal error");
                }

                int encodedBytes = Utf8Encoder.GetNumberOfEncodedBytes(codePoint);
                if (encodedBytes == 0)
                {
                    // TODO: Fix exception type
                    throw new Exception("Internal error: Utf16Decoder somehow got CodePoint out of range");
                }
                len += encodedBytes;

                i += encodedChars;
            }

            byte[] bytes = new byte[len];
            if (len == 0)
            {
                // Nothing to encode. This also avoids pinning a zero-length array, for which
                // `fixed` produces a null pointer.
                return bytes;
            }

            // Pass 2: encode into the exactly-sized array through a pinned span.
            unsafe
            {
                fixed (byte* array_pinned = bytes)
                {
                    Span<byte> p = new Span<byte>(array_pinned, len);

                    for (int i = 0; i < s.Length; /* intentionally no increment */)
                    {
                        UnicodeCodePoint codePoint;
                        int encodedChars;
                        if (!Utf16LittleEndianEncoder.TryDecodeCodePointFromString(s, i, out codePoint, out encodedChars))
                        {
                            // TODO: Fix exception type
                            throw new Exception("Internal error: we did pre-validation of the string, nothing should go wrong");
                        }
                        i += encodedChars;

                        int encodedBytes;
                        if (!Utf8Encoder.TryEncodeCodePoint(codePoint, p, out encodedBytes))
                        {
                            // TODO: Fix exception type
                            throw new Exception("Internal error: Utf16Decoder somehow got CodePoint out of range or the buffer is too small");
                        }

                        // Advance the window past the bytes that were just written.
                        p = p.Slice(encodedBytes);
                    }
                }
            }

            return bytes;
        }