/// <summary>Writes a char encoded in UTF-8</summary>
public static void WriteChar(ref SliceWriter writer, char value)
{
    if (value == 0)
    { // NUL => "00 FF"
        // note: \0 is the only unicode character that will produce a zero byte when converted to UTF-8
        writer.WriteByte4(FdbTupleTypes.Utf8, 0x00, 0xFF, 0x00);
    }
    else if (value < 0x80)
    { // 0x00..0x7F => 0xxxxxxx => one byte
        writer.WriteByte3(FdbTupleTypes.Utf8, (byte)value, 0x00);
    }
    else if (value < 0x800)
    { // 0x80..0x7FF => 110xxxxx 10xxxxxx => two bytes
        writer.WriteByte4(FdbTupleTypes.Utf8, (byte)(0xC0 | (value >> 6)), (byte)(0x80 | (value & 0x3F)), 0x00);
    }
    else
    { // 0x800..0xFFFF => 1110xxxx 10xxxxxx 10xxxxxx => three bytes
        // note: System.Char is 16 bits, and thus cannot represent UNICODE code points above 0xFFFF.
        // => This means that a single System.Char will never take more than 3 bytes in UTF-8!
        var tmp = Encoding.UTF8.GetBytes(new string(value, 1));
        writer.EnsureBytes(tmp.Length + 2);
        writer.UnsafeWriteByte(FdbTupleTypes.Utf8);
        writer.UnsafeWriteBytes(tmp, 0, tmp.Length);
        writer.UnsafeWriteByte(0x00);
    }
}
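// A minimal sanity-check sketch of the expected wire format. This helper is illustrative and
// not part of the encoder: it assumes FdbTupleTypes.Utf8 == 0x02 (the tuple layer's type code
// for UTF-8 strings) and that SliceWriter exposes a ToSlice() accessor, as in this codebase.
private static void DebugDumpChar(char value)
{
    var writer = new SliceWriter();
    WriteChar(ref writer, value);
    // 'A'  (U+0041) => 02 41 00
    // 'é'  (U+00E9) => 02 C3 A9 00
    // '\0' (U+0000) => 02 00 FF 00   (the zero byte is escaped as 00 FF before the final terminator)
    Console.WriteLine(writer.ToSlice().ToHexaString(' '));
}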
/// <summary>Writes a 64-bit UUID</summary>
public static void WriteUuid64(ref SliceWriter writer, Uuid64 value)
{
    writer.EnsureBytes(9);
    writer.UnsafeWriteByte(FdbTupleTypes.Uuid64);
    unsafe
    {
        byte* ptr = stackalloc byte[8];
        value.WriteTo(ptr);
        writer.UnsafeWriteBytes(ptr, 8);
    }
}
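// Layout sketch: the encoding is always 9 bytes, the type byte followed by the 8 bytes of the
// UUID. This assumes that Uuid64.WriteTo() emits big-endian bytes (so that encoded keys sort in
// the same order as the numeric values) and a Uuid64(ulong) constructor; both are assumptions
// about this codebase, used here only for illustration.
private static void DebugDumpUuid64()
{
    var writer = new SliceWriter();
    WriteUuid64(ref writer, new Uuid64(0x0123456789ABCDEFUL));
    // expected: <FdbTupleTypes.Uuid64> 01 23 45 67 89 AB CD EF
    Console.WriteLine(writer.ToSlice().ToHexaString(' '));
}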
/// <summary>Writes an RFC 4122 encoded 128-bit UUID</summary>
public static void WriteUuid128(ref SliceWriter writer, Uuid128 value)
{
    writer.EnsureBytes(17);
    writer.UnsafeWriteByte(FdbTupleTypes.Uuid128);
    unsafe
    {
        byte* ptr = stackalloc byte[16];
        value.WriteTo(ptr);
        writer.UnsafeWriteBytes(ptr, 16);
    }
}
/// <summary>Writes an RFC 4122 encoded 16-byte Microsoft GUID</summary>
public static void WriteGuid(ref SliceWriter writer, Guid value)
{
    writer.EnsureBytes(17);
    writer.UnsafeWriteByte(FdbTupleTypes.Uuid128);
    unsafe
    {
        // UUIDs are stored using the RFC 4122 standard, so we need to swap some parts of the System.Guid
        byte* ptr = stackalloc byte[16];
        Uuid128.Write(value, ptr);
        writer.UnsafeWriteBytes(ptr, 16);
    }
}
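// Why the swap matters, for both WriteUuid128 and WriteGuid above: System.Guid stores its first
// three fields (Data1..Data3) in little-endian order, while RFC 4122 serializes them big-endian.
// A sketch of the well-known difference (the helper itself is illustrative):
private static void DebugDumpGuidByteOrder()
{
    var g = Guid.Parse("00112233-4455-6677-8899-aabbccddeeff");
    // Guid.ToByteArray() (little-endian Data1..Data3): 33 22 11 00 55 44 77 66 88 99 AA BB CC DD EE FF
    // RFC 4122 byte order (what WriteGuid stores):     00 11 22 33 44 55 66 77 88 99 AA BB CC DD EE FF
    Console.WriteLine(BitConverter.ToString(g.ToByteArray()));
}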
private static unsafe bool TryWriteUnescapedUtf8String(ref SliceWriter writer, char* chars, int count)
{
    Contract.Requires(chars != null && count >= 0);

    // Several observations:
    // * Most strings will be keywords or ASCII-only with no zeroes. These can be copied directly to the buffer.
    // * We will only attempt to optimize strings that don't have any 00 to escape to 00 FF. For the others, we will fall back to converting to byte[] and then escaping.
    // * Since .NET strings are UTF-16, a single char never exceeds 0xFFFF, which takes at most 3 bytes in UTF-8 (EF BF BF).
    // * Most Western European languages have only a few non-ASCII chars here and there, and most of them will only use 2 bytes (ex: 'é' => 'C3 A9').
    // * More complex scripts with dedicated symbol pages (kanji, Arabic, ...) will take 2 or 3 bytes for each character.

    // We will first do a pass to check for the presence of 00 and non-ASCII chars:
    // => if we find at least one 00, we fall back to escaping the result of Encoding.UTF8.GetBytes() (see the usage sketch after this method)
    // => if we find only ASCII (1..127) chars, we have an optimized path that will truncate the chars to bytes
    // => if not, we will use a UTF8Encoder to convert the string to UTF-8, in chunks, using a small buffer allocated on the stack

    #region First pass: look for \0 and non-ASCII chars

    // The fastest way to check for non-ASCII is to OR all the chars together, and look at bits 7 to 15: if they are not all zero, there is at least ONE non-ASCII char.
    // Also, we abort as soon as we find a \0.

    char* ptr = chars;
    char* end = chars + count;
    char mask = '\0', c;
    while (ptr < end && (c = *ptr) != '\0') { mask |= c; ++ptr; }

    if (ptr < end)
    {
        return false; // there is at least one \0 in the string
    }

    // bits 7-15 all unset means the string is pure ASCII
    if ((mask >> 7) == 0)
    { // => directly dump the chars to the buffer
        WriteUnescapedAsciiChars(ref writer, chars, count);
        return true;
    }

    #endregion

    #region Second pass: encode the string to UTF-8, in chunks

    // Here we know that there is at least one non-ASCII char, and that there are no \0.
    // We will iterate through the string, filling as much of the buffer as possible.

    bool done;
    int remaining = count;
    ptr = chars;

    // We need at most 3 * CHUNK_SIZE bytes to encode a chunk:
    // > For small strings, we will allocate exactly string.Length * 3 bytes, and will be done in one chunk.
    // > For larger strings, we will call encoder.Convert(...) until it says it is done.
    const int CHUNK_SIZE = 1024;
    int bufLen = Encoding.UTF8.GetMaxByteCount(Math.Min(count, CHUNK_SIZE));
    byte* buf = stackalloc byte[bufLen];

    // We cannot really predict the final size of the encoded string, but:
    // * Western languages have a few chars that usually need 2 bytes. If we pre-allocate 50% more bytes, it should fit most of the time, without too much waste.
    // * Eastern languages will have all chars encoded to 3 bytes. If we also pre-allocate 50% more, we should only need one resize of the buffer (150% x 2 = 300%), which is acceptable.
    writer.EnsureBytes(checked(2 + count + (count >> 1))); // preallocate 150% of the string + 2 bytes
    writer.UnsafeWriteByte(FdbTupleTypes.Utf8);

    var encoder = Encoding.UTF8.GetEncoder();
    // note: encoder.Convert() tries to fill up the buffer as much as possible with complete chars, and will set 'done' to true when all chars have been converted.
    do
    {
        int charsUsed, bytesUsed;
        encoder.Convert(ptr, remaining, buf, bufLen, true, out charsUsed, out bytesUsed, out done);
        if (bytesUsed > 0)
        {
            writer.WriteBytes(buf, bytesUsed);
        }
        remaining -= charsUsed;
        ptr += charsUsed;
    }
    while (!done);
    Contract.Assert(remaining == 0 && ptr == end);

    // close the string
    writer.WriteByte(0x00);

    #endregion

    return true;
}
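// Minimal usage sketch for the fast path above: pin the string, try the unescaped path, and
// fall back to the slower escaped encoding when a \0 is present. 'WriteNulEscapedString' is a
// hypothetical name for that fallback, used here only for illustration.
private static unsafe void WriteStringSketch(ref SliceWriter writer, string value)
{
    fixed (char* chars = value)
    {
        if (!TryWriteUnescapedUtf8String(ref writer, chars, value.Length))
        { // the string contains at least one \0: every 00 byte of the UTF-8 form must be escaped to 00 FF
            WriteNulEscapedString(ref writer, value); // hypothetical fallback
        }
    }
}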