/// <summary>
/// Decodes the bytes held by this instance as UTF-8 and returns them
/// as a <see cref="string"/>.
/// </summary>
/// <returns>The text obtained by converting the stored bytes to UTF-16.</returns>
public string Utf8ToString()
{
    // Scratch buffer created with Length as its constructor argument; the converter fills it.
    var decoded = new CharsRef(Length);
    UnicodeUtil.UTF8toUTF16(bytes, Offset, Length, decoded);
    return decoded.ToString();
}
/// <summary>
/// Converts a range of UTF-8 encoded bytes into UTF-16 and stores the result in
/// <paramref name="chars"/>. The target's offset is reset to 0, its buffer is grown
/// (if needed) to cover the worst case of one UTF-16 code unit per input byte, and
/// its length is set to the number of code units written.
/// <para/>
/// NOTE: Multi-byte sequences are always consumed in full, even when that reads past
/// the requested <paramref name="length"/>; malformed input can therefore cause an
/// <see cref="IndexOutOfRangeException"/>. The input is not validated as UTF-8.
/// </summary>
/// <param name="utf8">Buffer containing the UTF-8 encoded bytes.</param>
/// <param name="offset">Index of the first byte to decode.</param>
/// <param name="length">Number of bytes to decode.</param>
/// <param name="chars">Receives the decoded UTF-16 code units.</param>
// TODO: broken if chars.offset != 0
public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars)
{
    // Output always starts at the beginning of the target buffer.
    chars.Offset = 0;
    int written = 0;

    char[] target = ArrayUtil.Grow(chars.Chars, length);
    chars.Chars = target;

    int end = offset + length;
    while (offset < end)
    {
        int lead = utf8[offset++] & 0xff;

        if (lead >= 0xf0)
        {
            // Four-byte sequence: 3 payload bits in the lead byte, 6 in each continuation byte.
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(lead < 0xf8, "b = 0x{0:x}", lead);
            }

            int codePoint = ((lead & 0x7) << 18)
                + ((utf8[offset] & 0x3f) << 12)
                + ((utf8[offset + 1] & 0x3f) << 6)
                + (utf8[offset + 2] & 0x3f);
            offset += 3;

            if (codePoint >= UNI_MAX_BMP)
            {
                // Emit as a surrogate pair: high half first, then low half.
                int halfBits = codePoint - 0x0010000;
                target[written++] = (char)((halfBits >> 10) + 0xD800);
                target[written++] = (char)((halfBits & HALF_MASK) + 0xDC00);
            }
            else
            {
                target[written++] = (char)codePoint;
            }
        }
        else if (lead >= 0xe0)
        {
            // Three-byte sequence: 4 payload bits in the lead byte.
            int middle = utf8[offset] & 0x3f;
            int last = utf8[offset + 1] & 0x3f;
            offset += 2;
            target[written++] = (char)(((lead & 0xf) << 12) + (middle << 6) + last);
        }
        else if (lead >= 0xc0)
        {
            // Two-byte sequence: 5 payload bits in the lead byte.
            int trail = utf8[offset++] & 0x3f;
            target[written++] = (char)(((lead & 0x1f) << 6) + trail);
        }
        else
        {
            // Single byte; values in 0x80..0xbf are not valid lead bytes (checked only by the assert).
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(lead < 0x80);
            }

            target[written++] = (char)lead;
        }
    }

    chars.Length = written - chars.Offset;
}
/// <summary>
/// Converts the UTF-8 bytes referenced by <paramref name="bytesRef"/> (its
/// <c>Bytes</c>, <c>Offset</c> and <c>Length</c>) to UTF-16, storing the result
/// in <paramref name="chars"/>. Delegates to the array-based overload.
/// </summary>
/// <param name="bytesRef">The source byte range to decode.</param>
/// <param name="chars">Receives the decoded UTF-16 code units.</param>
public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars)
{
    byte[] source = bytesRef.Bytes;
    int start = bytesRef.Offset;
    int count = bytesRef.Length;

    UTF8toUTF16(source, start, count, chars);
}