/// <summary>
/// Verifies that <c>String8.Utf8ToUtf16</c> maps every UTF-8 byte index of "a*b" to the
/// expected UTF-16 index, where '*' is each valid Unicode codepoint in turn
/// (U+0000 through U+10FFFF inclusive, skipping the surrogate range).
/// </summary>
public void Utf8ToUtf16_All()
{
    // Validate that index conversions on "a*b" map correctly for every Unicode codepoint.
    // The buffer is reused across iterations: 'a' (1 byte) + codepoint (at most 4 bytes) + 'b' (1 byte).
    byte[] buffer = new byte[10];
    String8.Convert("a", ref buffer);

    // Inclusive upper bound: U+10FFFF is the last valid codepoint and must be tested as well
    for (int codepoint = 0; codepoint <= 0x10FFFF; ++codepoint)
    {
        // Skip illegal codepoints (the UTF-16 surrogate range cannot be converted on its own)
        if (codepoint >= 0xD800 && codepoint <= 0xDFFF)
        {
            continue;
        }

        // Convert the codepoint to UTF16 (one char, or a surrogate pair above U+FFFF)
        string value = char.ConvertFromUtf32(codepoint);

        // Append it to the String8 after 'a'
        String8 value8 = String8.Convert(value, ref buffer, 1);

        // Append 'b' after that
        String8.Convert("b", ref buffer, value8.Index + value8.Length);

        // Map the whole value
        String8 whole8 = new String8(buffer, 0, value8.Index + value8.Length + 1);

        // 'a' should always map to index 0
        Assert.Equal(0, String8.Utf8ToUtf16(0, whole8));

        // 'b' should always map to the last .NET char (the length needed for the .NET string + 1 for 'a')
        Assert.Equal(1 + value.Length, String8.Utf8ToUtf16(whole8.Length - 1, whole8));

        // All indices in between are the middle character (index 1, since it's after 'a')
        for (int i = 1; i < whole8.Length - 1; ++i)
        {
            Assert.Equal(1, String8.Utf8ToUtf16(i, whole8));
        }
    }
}
/// <summary>
/// Converts a .NET string to a <c>String8</c> without a caller-supplied buffer.
/// </summary>
/// <remarks>
/// Passes a null buffer by reference to <c>String8.Convert</c>.
/// NOTE(review): presumably Convert allocates a fresh array on every call, which is
/// what makes this overload "expensive" - confirm against String8.Convert.
/// </remarks>
/// <param name="value">String to convert.</param>
/// <returns>The <c>String8</c> returned by <c>String8.Convert</c>.</returns>
public static String8 ConvertExpensively(string value)
{
    byte[] scratch = null;
    String8 converted = String8.Convert(value, ref scratch);
    return converted;
}
/// <summary>
/// Reports whether this value equals <paramref name="other"/>, defined as
/// <c>CompareTo(other)</c> returning zero.
/// </summary>
/// <param name="other">Value to compare against.</param>
/// <returns>True when the comparison result is zero; otherwise false.</returns>
public bool Equals(String8 other)
{
    int comparison = CompareTo(other);
    return comparison == 0;
}
/// <summary>
///  Translate a UTF-8 index (from an RE2 match) to a UTF-16 index (safe for indexing into a .NET string).
///  Takes an optional previous mapping, so it doesn't have to rescan from the beginning of the text each time.
/// </summary>
/// <remarks>
///  <para>
///   UTF-8:
///     0xxxxxxx [lt 0x80] - First byte of single byte character
///     10xxxxxx [lt 0xC0] - Non-first byte of any multi-byte character
///     110xxxxx [lt 0xE0] - First byte of two byte character
///     1110xxxx [lt 0xF0] - First byte of three byte character
///     11110xxx [lt 0xF8] - First byte of four byte character.
///  </para>
///  <para>
///   All Unicode codepoints up to U+FFFF fit in one UTF-16 char.
///   UTF-8 can store 4 + 6 + 6 bits = 16 bits in a three byte encoding,
///   so all UTF-8 three byte and smaller characters will be one UTF-16 character.
///  </para>
///  <para>
///   An index that points at a continuation byte maps to the UTF-16 index of the
///   character that contains it.
///  </para>
/// </remarks>
/// <param name="index">UTF-8 index to translate (0 to text8.Length inclusive).</param>
/// <param name="text8">String8 value.</param>
/// <param name="previousUtf8Index">A previous UTF8 index translated, if available. Only used when it is greater than zero and not after <paramref name="index"/>.</param>
/// <param name="previousUtf16Index">The UTF-16 equivalent of the previous index, if available.</param>
/// <returns>UTF-16 index corresponding to the UTF-8 index passed in.</returns>
/// <exception cref="ArgumentOutOfRangeException">
///  <paramref name="index"/> is outside the text, or the Index/Length of <paramref name="text8"/> do not fit its array.
/// </exception>
/// <exception cref="ArgumentNullException">The array behind <paramref name="text8"/> is null.</exception>
public static int Utf8ToUtf16(int index, String8 text8, int previousUtf8Index = 0, int previousUtf16Index = 0)
{
    if (index < 0 || index > text8.Length)
    {
        throw new ArgumentOutOfRangeException(nameof(index));
    }

    // Report the real parameter name (text8) so the exception points at the right argument
    if (text8.Array == null)
    {
        throw new ArgumentNullException(nameof(text8));
    }

    if (text8.Index < 0 || text8.Length < 0 || text8.Index + text8.Length > text8.Array.Length)
    {
        throw new ArgumentOutOfRangeException(nameof(text8));
    }

    // The first character is always the same
    if (index == 0)
    {
        return 0;
    }

    int startIndex = 0;
    int utf16Index = 0;

    // Track from the previous match, if it's available and before the current one
    if (previousUtf8Index > 0 && previousUtf8Index <= index)
    {
        startIndex = previousUtf8Index;
        utf16Index = previousUtf16Index;

        // If the previous index landed inside a multi-byte character, back up to that
        // character's first byte. Its UTF-16 index is the same (continuation bytes map to
        // the character containing them), and the scan below learns a character's length
        // only from its first byte; starting mid-character would leave it uncounted.
        while (startIndex > 0 && startIndex < text8.Length)
        {
            byte resume = text8.Array[startIndex + text8.Index];
            if (!(resume >= 0x80 && resume < 0xC0))
            {
                break;
            }

            --startIndex;
        }
    }

    // Count the number of UTF16 characters before text8[index]
    int currentLength = 0;
    for (int i = startIndex; i <= index - 1; ++i)
    {
        byte c = text8.Array[i + text8.Index];

        if (c >= 0x80 && c < 0xC0)
        {
            // Continuation Byte; don't count (we figure out the counts from the first byte only)
        }
        else
        {
            // Add the previous character length (now that it's done)
            utf16Index += currentLength;

            // This character will be one UTF-16 char if it's 1-3 bytes, and two if it's 4 bytes (0xF0+)
            currentLength = c < 0xF0 ? 1 : 2;
        }
    }

    // If the character being pointed to isn't a continuation character, it's a new index.
    // An index equal to the length (one past the end) is treated as a new character start.
    byte last = index < text8.Length ? text8.Array[index + text8.Index] : (byte)0;
    if (!(last >= 0x80 && last < 0xC0))
    {
        utf16Index += currentLength;
    }

    return utf16Index;
}
/// <summary>
///  Initializes a new instance of the <see cref="FlexString"/> class from a String8 value.
/// </summary>
/// <remarks>
///  Stores the value and flags the String8 form as available.
/// </remarks>
/// <param name="value">String8 value to wrap.</param>
public FlexString(String8 value)
{
    _isString8Available = true;
    _string8 = value;
}