/// <summary>Merge an array of keys with a same prefix, writing all of them into a single buffer</summary>
/// <param name="prefix">Prefix written in front of every key (skipped when <c>prefix.IsPresent</c> is false)</param>
/// <param name="keys">Array of keys to pack</param>
/// <returns>Result of <c>FdbKey.SplitIntoSegments</c> applied to the writer's buffer, using the end offset recorded after each key</returns>
/// <exception cref="ArgumentNullException">If <paramref name="prefix"/> or <paramref name="keys"/> compares equal to null</exception>
/// <exception cref="OverflowException">If the total size of all the prefixed keys does not fit in an <see cref="int"/></exception>
public static Slice[] Merge(Slice prefix, [NotNull] Slice[] keys)
{
    if (prefix == null)
    {
        throw new ArgumentNullException(nameof(prefix));
    }
    if (keys == null)
    {
        throw new ArgumentNullException(nameof(keys));
    }

    //REVIEW: merge this code with Slice.ConcatRange!

    // we can pre-allocate exactly the buffer by computing the total size of all keys
    // note: Sum() already throws on overflow, so the rest of the computation is checked as well,
    // instead of silently wrapping around and handing a bogus size to the writer
    int size = checked(keys.Sum(key => key.Count) + keys.Length * prefix.Count);
    var writer = new SliceWriter(size);
    var next = new List<int>(keys.Length);

    //TODO: use multiple buffers if item count is huge ?

    // the prefix does not change between iterations, so we only need to test it once
    bool hasPrefix = prefix.IsPresent;

    foreach (var key in keys)
    {
        if (hasPrefix)
        {
            writer.WriteBytes(prefix);
        }
        writer.WriteBytes(key);
        // remember where this key ends, so that the buffer can be split afterwards
        next.Add(writer.Position);
    }

    return FdbKey.SplitIntoSegments(writer.Buffer, 0, next);
}
/// <summary>Merge an array of keys with a same prefix, all sharing the same buffer</summary>
/// <param name="prefix">Prefix shared by all keys</param>
/// <param name="keys">Array of keys to pack (validated with <c>Contract.NotNull</c>)</param>
/// <returns>Array of slices (for all keys) that share the same underlying buffer</returns>
/// <exception cref="ArgumentNullException">If <paramref name="prefix"/> is a null slice (<c>prefix.IsNull</c>)</exception>
/// <exception cref="OverflowException">If the total size of all the prefixed keys does not fit in an <see cref="int"/></exception>
public static Slice[] Merge(Slice prefix, Slice[] keys)
{
    if (prefix.IsNull)
    {
        throw new ArgumentNullException(nameof(prefix));
    }
    Contract.NotNull(keys);

    //REVIEW: merge this code with Slice.ConcatRange!

    // we can pre-allocate exactly the buffer by computing the total size of all keys
    // note: Sum() already throws on overflow, so the rest of the computation is checked as well,
    // instead of silently wrapping around and handing a bogus size to the writer
    int size = checked(keys.Sum(key => key.Count) + keys.Length * prefix.Count);
    var writer = new SliceWriter(size);
    var next = new List<int>(keys.Length);

    //TODO: use multiple buffers if item count is huge ?

    // the prefix is the same for every key: grab its span only once
    var prefixSpan = prefix.Span;
    bool hasPrefix = prefixSpan.Length != 0;

    foreach (var key in keys)
    {
        if (hasPrefix)
        {
            writer.WriteBytes(prefixSpan);
        }
        writer.WriteBytes(key.Span);
        // remember where this key ends, so that the buffer can be split afterwards
        next.Add(writer.Position);
    }

    return SplitIntoSegments(writer.Buffer, 0, next);
}
/// <summary>Create a new writer that already contains the key prefix returned by <c>GetKeyPrefix()</c></summary>
/// <param name="extra">Number of bytes requested in addition to the size of the prefix when creating the writer (defaults to 32)</param>
/// <returns>Writer that starts with the bytes of the key prefix</returns>
public SliceWriter OpenWriter(int extra = 32)
{
    var prefix = GetKeyPrefix();

    // size the writer for the prefix plus whatever the caller intends to append
    int capacity = prefix.Count + extra;
    var writer = new SliceWriter(capacity); //TODO: BufferPool ?

    writer.WriteBytes(prefix);
    return writer;
}
/// <summary>Attempt to write a string as an unescaped UTF-8 string, which is only possible when it does not contain any NUL ('\0') character</summary>
/// <param name="writer">Destination writer</param>
/// <param name="chars">Pointer to the first UTF-16 char of the string (must not be null)</param>
/// <param name="count">Number of chars to encode (must not be negative)</param>
/// <returns>False if the string contains at least one '\0', in which case nothing has been written to <paramref name="writer"/> and the caller is expected to fall back to the escaping path; otherwise true, once the string has been written.</returns>
private static unsafe bool TryWriteUnescapedUtf8String(ref SliceWriter writer, char* chars, int count)
{
    Contract.Requires(chars != null && count >= 0);

    // Several observations:
    // * Most strings will be keywords or ASCII-only with no zeroes. These can be copied directly to the buffer
    // * We will only attempt to optimize strings that don't have any 00 to escape to 00 FF. For the others, the caller will fall back to converting to byte[] then escaping.
    // * Since .NET's strings are UTF-16, the max possible UNICODE value to encode is 0xFFFF, which takes 3 bytes in UTF-8 (EF BF BF)
    // * Most Western European languages have only a few non-ASCII chars here and there, and most of them will only use 2 bytes (ex: 'é' => 'C3 A9')
    // * More complex scripts with dedicated symbol pages (kanjis, arabic, ....) will take 2 or 3 bytes for each character.

    // We will first do a pass to check for the presence of 00 and non-ASCII chars
    // => if we find at least one 00, we return false so that the caller can fall back to escaping the result of Encoding.UTF8.GetBytes()
    // => if we find only ASCII (1..127) chars, we have an optimized path that will truncate the chars to bytes
    // => if not, we will use a UTF-8 encoder to convert the string to UTF-8, in chunks, using a small buffer allocated on the stack

    #region First pass: look for \0 and non-ASCII chars

    // fastest way to check for non-ASCII, is to OR all the chars together, and look at bits 7 to 15. If they are not all zero, there is at least ONE non-ASCII char.
    // also, we abort as soon as we find a \0

    char* ptr = chars;
    char* end = chars + count;
    char mask = '\0', c;
    while (ptr < end && (c = *ptr) != '\0')
    {
        mask |= c;
        ++ptr;
    }

    // the scan only stops before the end when it hit a \0
    if (ptr < end) return false; // there is at least one \0 in the string

    // bit 7-15 all unset means the string is pure ASCII (this is also the path taken by an empty string, since the mask stays at zero)
    if ((mask >> 7) == 0)
    {
        // => directly dump the chars to the buffer
        WriteUnescapedAsciiChars(ref writer, chars, count);
        return true;
    }

    #endregion

    #region Second pass: encode the string to UTF-8, in chunks

    // Here we know that there is at least one non-ASCII char, and that there are no \0
    // We will iterate through the string, filling as much of the buffer as possible

    bool done;
    int charsUsed, bytesUsed;
    int remaining = count;
    ptr = chars;

    // The stack buffer is sized with Encoding.UTF8.GetMaxByteCount(...), which is enough to encode up to CHUNK_SIZE chars at 3 bytes per char
    // > For small strings (up to CHUNK_SIZE chars), the buffer can hold the whole encoded string, and we will be done in one chunk
    // > For larger strings, we will call encoder.Convert(...) until it says it is done.
    const int CHUNK_SIZE = 1024;
    int bufLen = Encoding.UTF8.GetMaxByteCount(Math.Min(count, CHUNK_SIZE));
    byte* buf = stackalloc byte[bufLen];

    // We can not really predict the final size of the encoded string, but:
    // * Western languages have a few chars that usually need 2 bytes. If we pre-allocate 50% more bytes, it should fit most of the time, without too much waste
    // * Eastern languages will have all chars encoded to 3 bytes. If we also pre-allocate 50% more, we should only need one resize of the buffer (150% x 2 = 300%), which is acceptable
    writer.EnsureBytes(checked(2 + count + (count >> 1))); // preallocate 150% of the string + 2 bytes

    // start of the string: type header
    writer.UnsafeWriteByte(FdbTupleTypes.Utf8);

    var encoder = Encoding.UTF8.GetEncoder();
    // note: encoder.Convert() tries to fill up the buffer as much as possible with complete chars, and will set 'done' to true when all chars have been converted.
    do
    {
        // 'remaining' always covers the whole rest of the string, so every call is made with flush set to true
        encoder.Convert(ptr, remaining, buf, bufLen, true, out charsUsed, out bytesUsed, out done);
        if (bytesUsed > 0)
        {
            writer.WriteBytes(buf, bytesUsed);
        }
        // advance past the chars that were consumed by this chunk
        remaining -= charsUsed;
        ptr += charsUsed;
    }
    while (!done);
    Contract.Assert(remaining == 0 && ptr == end);

    // close the string
    writer.WriteByte(0x00);

    #endregion

    return true;
}