public bool TryCompression(Table table, TableSchema schema)
{
    try
    {
        var tx = table._tx;
        int maxSpace = ZstdLib.GetMaxCompression(RawBuffer.Length);
        _compressedScope = tx.Allocator.Allocate(maxSpace + OverheadSize, out CompressedBuffer);
        Compressed = false;

        var compressionDictionary = tx.LowLevelTransaction.Environment.CompressionDictionariesHolder
            .GetCompressionDictionaryFor(tx, table.CurrentCompressionDictionaryId);

        CompressionTried = true;
        var size = ZstdLib.Compress(RawBuffer.ToReadOnlySpan(), CompressedBuffer.ToSpan(), compressionDictionary);
        size += WriteVariableSizeIntInReverse(CompressedBuffer.Ptr + size, compressionDictionary.Id);
        CompressedBuffer.Truncate(size);

        var compressionRatio = GetCompressionRatio(size, RawBuffer.Length);
        if (compressionRatio > compressionDictionary.ExpectedCompressionRatio + 10)
        {
            // Training dictionaries is expensive, so we only do it when the current
            // compressed value is significantly worse than the previous ones.
            var etagTree = table.GetFixedSizeTree(schema.CompressedEtagSourceIndex);
            if (ShouldRetrain(etagTree))
            {
                MaybeTrainCompressionDictionary(table, etagTree);
            }
        }

        if (CompressedBuffer.Length >= RawBuffer.Length)
        {
            // The compressed value is as large as (or larger than) the raw one,
            // so skip compression for this value.
            _compressedScope.Dispose();
            // Explicitly not disposing RawScope: we still need the raw buffer
            // when an update turns into an insert because the new size is too large.
            // RawScope.Dispose();
            Compressed = false;
            return false;
        }

        Compressed = true;
        return true;
    }
    catch
    {
        _compressedScope.Dispose();
        RawScope.Dispose();
        throw;
    }
}
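// GetCompressionRatio is called above but not shown in this excerpt. Below is a
// minimal sketch of a plausible implementation, assuming the ratio is expressed
// as a percentage of the raw size (which would make the "+ 10" slack in the
// retraining check a ten-percentage-point threshold). The actual implementation
// may differ; this hypothetical helper is for illustration only.
private static float GetCompressionRatio(int compressedSize, int rawSize)
{
    // Percentage of the raw size that the compressed form occupies; lower is better.
    return compressedSize * 100.0f / rawSize;
}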
private void MaybeTrainCompressionDictionary(Table table, FixedSizeTree etagsTree)
{
    // The idea is that we'll get better results by including the most recently
    // modified documents, so we iterate over the etag index, which is guaranteed
    // to be always increasing.
    var dataIds = ArrayPool<long>.Shared.Rent(256);
    var sizes = ArrayPool<UIntPtr>.Shared.Rent(256);
    try
    {
        int used = 0;
        var totalSize = 0;
        int totalSkipped = 0;

        using (var it = etagsTree.Iterate())
        {
            if (it.SeekToLast() == false)
            {
                return; // empty table, nothing to train on
            }

            do
            {
                long id = it.CreateReaderForCurrent().ReadLittleEndianInt64();
                table.DirectRead(id, out var size); // we only need the size here
                if (size > 32 * 1024)
                {
                    if (totalSkipped++ > 16 * 1024)
                    {
                        return; // we are scanning too much, no need to try this hard
                    }
                    // We skip documents that are too big: they will compress well
                    // on their own, and are likely *too* unique to add meaningfully
                    // to the dictionary.
                    continue;
                }

                sizes[used] = (UIntPtr)size;
                dataIds[used++] = id;
                totalSize += size;
            } while (used < 256 && it.MovePrev() && totalSize < 1024 * 1024);
        }

        if (used < 16)
        {
            return; // too few samples to train a dictionary on
        }

        var tx = table._tx;
        using (tx.Allocator.Allocate(totalSize, out var buffer))
        {
            // Copy the sampled values into one contiguous buffer, as the trainer expects.
            var cur = buffer.Ptr;
            for (int i = 0; i < used; i++)
            {
                var ptr = table.DirectRead(dataIds[i], out var size);
                Memory.Copy(cur, ptr, size);
                cur += size;
            }

            using (tx.Allocator.Allocate(
                // the dictionary has to fit in a single page, after the page
                // header and the dictionary info
                Constants.Storage.PageSize - PageHeader.SizeOf - sizeof(CompressionDictionaryInfo),
                out var dictionaryBuffer))
            {
                Span<byte> dictionaryBufferSpan = dictionaryBuffer.ToSpan();
                ZstdLib.Train(new ReadOnlySpan<byte>(buffer.Ptr, totalSize),
                    new ReadOnlySpan<UIntPtr>(sizes, 0, used),
                    ref dictionaryBufferSpan);

                var dictionariesTree = tx.CreateTree(TableSchema.CompressionDictionariesSlice);
                var newId = (int)(dictionariesTree.State.NumberOfEntries + 1);

                using var compressionDictionary = new ZstdLib.CompressionDictionary(newId,
                    dictionaryBuffer.Ptr, dictionaryBufferSpan.Length, 3);

                if (ShouldReplaceDictionary(tx, compressionDictionary) == false)
                {
                    return;
                }

                table.CurrentCompressionDictionaryId = newId;
                compressionDictionary.ExpectedCompressionRatio = GetCompressionRatio(CompressedBuffer.Length, RawBuffer.Length);

                var rev = Bits.SwapBytes(newId);
                using (Slice.External(tx.Allocator, (byte*)&rev, sizeof(int), out var slice))
                using (dictionariesTree.DirectAdd(slice, sizeof(CompressionDictionaryInfo) + dictionaryBufferSpan.Length, out var dest))
                {
                    *((CompressionDictionaryInfo*)dest) = new CompressionDictionaryInfo
                    {
                        ExpectedCompressionRatio = compressionDictionary.ExpectedCompressionRatio
                    };
                    Memory.Copy(dest + sizeof(CompressionDictionaryInfo), dictionaryBuffer.Ptr, dictionaryBufferSpan.Length);
                }

                tx.LowLevelTransaction.OnDispose += RecreateRecoveryDictionaries;
            }
        }
    }
    finally
    {
        ArrayPool<long>.Shared.Return(dataIds);
        ArrayPool<UIntPtr>.Shared.Return(sizes);
    }
}
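// For context: ZstdLib.Train above presumably wraps zstd's dictionary trainer,
// which takes exactly the shape of input built here (one contiguous sample
// buffer plus an array of per-sample sizes). The native entry point, declared
// in zstd's zdict.h, is shown below as an illustrative P/Invoke sketch; this is
// not the project's actual binding, and the library name may need adjusting per
// platform. (Requires using System; using System.Runtime.InteropServices.)
internal static unsafe class ZDictNative
{
    // size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
    //     const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples);
    // Trains a dictionary from nbSamples concatenated samples and returns the
    // dictionary size written into dictBuffer, or an error code.
    [DllImport("libzstd", CallingConvention = CallingConvention.Cdecl)]
    public static extern UIntPtr ZDICT_trainFromBuffer(
        void* dictBuffer, UIntPtr dictBufferCapacity,
        void* samplesBuffer, UIntPtr* samplesSizes, uint nbSamples);

    // Returns non-zero when the size_t returned by the trainer encodes an error
    // (e.g. too few samples, or a destination buffer that is too small).
    [DllImport("libzstd", CallingConvention = CallingConvention.Cdecl)]
    public static extern uint ZDICT_isError(UIntPtr code);
}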