Пример #1
0
        /// <summary>
        ///  Verifies String8.Utf8ToUtf16 on the text "a*b", where '*' is each Unicode
        ///  codepoint in turn (U+0000 through U+10FFFF inclusive, skipping surrogates).
        /// </summary>
        public void Utf8ToUtf16_All()
        {
            // Highest valid Unicode codepoint; the loop bound below is inclusive so it is tested too.
            const int MaxCodepoint = 0x10FFFF;

            // Validate that index conversions on "a*b" map correctly for every Unicode codepoint
            byte[] buffer = new byte[10];
            String8.Convert("a", ref buffer);

            for (int codepoint = 0; codepoint <= MaxCodepoint; ++codepoint)
            {
                // Skip the surrogate range; char.ConvertFromUtf32 rejects these values
                if (codepoint >= 0xD800 && codepoint <= 0xDFFF)
                {
                    continue;
                }

                // Convert the codepoint to UTF16 (one char, or a surrogate pair above U+FFFF)
                string value = char.ConvertFromUtf32(codepoint);

                // Append it to the String8 after 'a'
                String8 value8 = String8.Convert(value, ref buffer, 1);

                // Append 'b' after that
                int afterValue = value8.Index + value8.Length;
                String8.Convert("b", ref buffer, afterValue);

                // Map the whole value ('a' + codepoint bytes + 'b')
                String8 whole8 = new String8(buffer, 0, afterValue + 1);

                // 'a' should always map to index 0
                Assert.Equal(0, String8.Utf8ToUtf16(0, whole8));

                // 'b' should always map to the last .NET char (the length needed for the .NET string + 1 for 'a')
                Assert.Equal(1 + value.Length, String8.Utf8ToUtf16(whole8.Length - 1, whole8));

                // All indices in between are the middle character (index 1, since it's after 'a')
                for (int i = 1; i < whole8.Length - 1; ++i)
                {
                    Assert.Equal(1, String8.Utf8ToUtf16(i, whole8));
                }
            }
        }
Пример #2
0
 /// <summary>
 ///  Converts a .NET string to a String8 without reusing a caller-supplied buffer:
 ///  a null byte[] reference is handed to String8.Convert on every call.
 /// </summary>
 /// <param name="value">String to convert.</param>
 /// <returns>The String8 returned by String8.Convert for <paramref name="value"/>.</returns>
 public static String8 ConvertExpensively(string value)
 {
     // Start with no buffer; presumably Convert allocates one as needed (confirm in String8.Convert).
     byte[] scratch = null;
     String8 converted = String8.Convert(value, ref scratch);
     return converted;
 }
Пример #3
0
 /// <summary>
 ///  Reports whether this value equals <paramref name="other"/>, defined as
 ///  CompareTo(other) yielding zero.
 /// </summary>
 /// <param name="other">Value to compare against.</param>
 /// <returns>True when CompareTo reports no difference; otherwise false.</returns>
 public bool Equals(String8 other)
 {
     bool isSame = CompareTo(other) == 0;
     return isSame;
 }
Пример #4
0
        /// <summary>
        ///  Translate a UTF-8 index (from an RE2 match) to a UTF-16 match (safe for indexing into a .NET string).
        ///  Takes an optional previous mapping, so it doesn't have to rescan from the beginning of the file each time.
        /// </summary>
        /// <remarks>
        /// <para>
        ///  UTF-8:
        ///    0xxxxxxx [lt 0x80] - First byte of single byte character
        ///    10xxxxxx [lt 0xC0] - Non-first byte of any multi-byte character
        ///    110xxxxx [lt 0xE0] - First byte of two byte character
        ///    1110xxxx [lt 0xF0] - First byte of three byte character
        ///    11110xxx [lt 0xF8] - First byte of four byte character.
        /// </para>
        /// <para>
        ///    All Unicode codepoints up to U+FFFF fit in one UTF-16 char.
        ///    UTF-8 can store 4 + 6 + 6 bits = 16 bits in a three byte encoding,
        ///    so all UTF-8 three byte and smaller characters will be one UTF-16 character.
        /// </para>
        /// <para>
        ///    An index that points at a continuation byte maps to the UTF-16 index of the
        ///    character containing it. The previous mapping is only used when
        ///    <paramref name="previousUtf8Index"/> is greater than zero and not after
        ///    <paramref name="index"/>; it may itself point into the middle of a character.
        /// </para>
        /// </remarks>
        /// <param name="index">UTF-8 index to translate.</param>
        /// <param name="text8">String8 value.</param>
        /// <param name="previousUtf8Index">A previous UTF8 index translated, if available.</param>
        /// <param name="previousUtf16Index">The UTF-16 equivalent of the previous index, if available.</param>
        /// <returns>UTF-16 index corresponding to the UTF-8 index passed in.</returns>
        /// <exception cref="ArgumentOutOfRangeException">
        ///  <paramref name="index"/> is negative or greater than the length of <paramref name="text8"/>,
        ///  or <paramref name="text8"/> describes a range outside its backing array.
        /// </exception>
        /// <exception cref="ArgumentNullException">The backing array of <paramref name="text8"/> is null.</exception>
        public static int Utf8ToUtf16(int index, String8 text8, int previousUtf8Index = 0, int previousUtf16Index = 0)
        {
            if (index < 0 || index > text8.Length)
            {
                throw new ArgumentOutOfRangeException(nameof(index));
            }
            if (text8.Array == null)
            {
                throw new ArgumentNullException(nameof(text8), "text8.Array must not be null.");
            }
            if (text8.Index < 0 || text8.Length < 0 || text8.Index + text8.Length > text8.Array.Length)
            {
                throw new ArgumentOutOfRangeException(nameof(text8), "text8.Index and text8.Length must describe a range inside text8.Array.");
            }

            // The first character is always the same
            if (index == 0)
            {
                return 0;
            }

            int startIndex = 0;
            int utf16Index = 0;

            // Track from the previous match, if it's available and before the current one
            if (previousUtf8Index > 0 && previousUtf8Index <= index)
            {
                startIndex = previousUtf8Index;
                utf16Index = previousUtf16Index;

                // The previous index may point at a continuation byte. Its UTF-16 index is that of the
                // containing character, so back up to that character's lead byte; otherwise the scan
                // below would never see the lead byte and the character would not be counted.
                // (startIndex == text8.Length is the end position and has no byte to inspect.)
                while (startIndex > 0
                    && startIndex < text8.Length
                    && (text8.Array[startIndex + text8.Index] & 0xC0) == 0x80)
                {
                    --startIndex;
                }
            }

            // Count the number of UTF16 characters before text8[index].
            // currentLength is the UTF-16 length of the character currently being scanned; it is only
            // added to the total once the scan reaches the start of the next character.
            int currentLength = 0;

            for (int i = startIndex; i < index; ++i)
            {
                byte c = text8.Array[i + text8.Index];

                if (c >= 0x80 && c < 0xC0)
                {
                    // Continuation Byte; don't count (we figure out the counts from the first byte only)
                    continue;
                }

                // Add the previous character length (now that it's done)
                utf16Index += currentLength;

                // This character will be one UTF-16 char if it's 1-3 bytes, and two if it's 4 bytes (0xF0+)
                currentLength = c < 0xF0 ? 1 : 2;
            }

            // If the character being pointed to isn't a continuation character, it's a new index.
            // The end position (index == Length) is treated like the start of a new character.
            byte last = index < text8.Length ? text8.Array[index + text8.Index] : (byte)0;

            if (!(last >= 0x80 && last < 0xC0))
            {
                utf16Index += currentLength;
            }

            return utf16Index;
        }
 /// <summary>
 /// Creates a <see cref="FlexString"/> from an existing String8 value.
 /// </summary>
 /// <param name="value">String8 value stored by the new instance.</param>
 public FlexString(String8 value)
 {
     // Record that the String8 form is available, and keep the supplied value.
     this._isString8Available = true;
     this._string8 = value;
 }