private void CompressionWorker(BlockingCollection<Block> toCompress, BlockingCollection<Block> toWrite, int columns, OrderedWaiter waiter,
    ExceptionMarshaller exMarshaller)
{
    Contracts.AssertValue(exMarshaller);
    try
    {
        _host.AssertValue(toCompress);
        _host.AssertValue(toWrite);
        _host.Assert(columns > 0);
        _host.Assert(_deterministicBlockOrder == (waiter != null));

        foreach (Block block in toCompress.GetConsumingEnumerable(exMarshaller.Token))
        {
            MemoryStream compressed = _memPool.Get();
            int uncompLength;
            using (Stream stream = _compression.CompressStream(compressed))
            {
                MemoryStream uncompressed = block.BlockData;
                uncompLength = (int)uncompressed.Length;
                ArraySegment<byte> buffer;
                bool tmp = uncompressed.TryGetBuffer(out buffer);
                Contracts.Assert(tmp);
                stream.Write(buffer.Array, buffer.Offset, buffer.Count);
                _memPool.Return(ref uncompressed);
            }
            if (_deterministicBlockOrder)
                waiter.Wait((long)columns * block.BlockIndex + block.ColumnIndex, exMarshaller.Token);
            toWrite.Add(new Block(compressed, block.ColumnIndex, block.BlockIndex, uncompLength), exMarshaller.Token);
            if (_deterministicBlockOrder)
                waiter.Increment();
        }
    }
    catch (Exception ex)
    {
        exMarshaller.Set("compressing", ex);
    }
}
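// The following is a minimal, illustrative sketch (not part of the saver) of the ordering
// pattern CompressionWorker relies on: several workers consume numbered items in parallel,
// but each blocks until its sequence number becomes current before publishing, so output
// order matches input order. A plain monitor stands in for OrderedWaiter here; the Wait and
// PulseAll calls play the roles of waiter.Wait(position) and waiter.Increment(). Assumes the
// file's existing usings cover System.Threading and System.Collections.Concurrent.
private static void OrderedHandOffSketch()
{
    var input = new BlockingCollection<int>();
    var output = new BlockingCollection<int>();
    int next = 0;
    object sync = new object();

    void Worker()
    {
        foreach (int item in input.GetConsumingEnumerable())
        {
            int result = item * item; // Stand-in for the compression work.
            lock (sync)
            {
                while (next != item) // Like waiter.Wait(item): block until it is our turn.
                    Monitor.Wait(sync);
                output.Add(result);
                next++;              // Like waiter.Increment(): release the next item.
                Monitor.PulseAll(sync);
            }
        }
    }

    Thread[] threads = new Thread[4];
    for (int i = 0; i < threads.Length; ++i)
        (threads[i] = new Thread(Worker) { IsBackground = true }).Start();
    for (int i = 0; i < 100; ++i)
        input.Add(i);
    input.CompleteAdding();
    foreach (Thread t in threads)
        t.Join();
    output.CompleteAdding();
    // output now yields 0, 1, 4, 9, ... in input order despite the parallel workers.
}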
public void SaveData(Stream stream, IDataView data, params int[] colIndices)
{
    _host.CheckValue(stream, nameof(stream));
    _host.CheckValue(data, nameof(data));
    _host.CheckValueOrNull(colIndices);
    _host.CheckParam(stream.CanWrite, nameof(stream), "cannot save to non-writable stream");
    _host.CheckParam(stream.CanSeek, nameof(stream), "cannot save to non-seekable stream");
    _host.CheckParam(stream.Position == 0, nameof(stream), "stream must be positioned at head of stream");

    using (IChannel ch = _host.Start("Saving"))
    using (ExceptionMarshaller exMarshaller = new ExceptionMarshaller())
    {
        var toWrite = new BlockingCollection<Block>(16);
        var toCompress = new BlockingCollection<Block>(16);
        var activeColumns = GetActiveColumns(data.Schema, colIndices);
        int rowsPerBlock = RowsPerBlockHeuristic(data, activeColumns);
        ch.Assert(rowsPerBlock > 0);
        Stopwatch sw = new Stopwatch();

        // Set up the compression and write workers that consume the input information first.
        Task compressionTask = null;
        if (activeColumns.Length > 0)
        {
            OrderedWaiter waiter = _deterministicBlockOrder ? new OrderedWaiter() : null;
            Thread[] compressionThreads = new Thread[Environment.ProcessorCount];
            for (int i = 0; i < compressionThreads.Length; ++i)
            {
                compressionThreads[i] = Utils.CreateBackgroundThread(
                    () => CompressionWorker(toCompress, toWrite, activeColumns.Length, waiter, exMarshaller));
                compressionThreads[i].Start();
            }
            compressionTask = new Task(() =>
            {
                foreach (Thread t in compressionThreads)
                    t.Join();
            });
            compressionTask.Start();
        }

        // While there is an advantage to putting the IO into a separate thread, there is not an
        // advantage to having more than one worker.
        Thread writeThread = Utils.CreateBackgroundThread(
            () => WriteWorker(stream, toWrite, activeColumns, data.Schema, rowsPerBlock, _host, exMarshaller));
        writeThread.Start();
        sw.Start();

        // REVIEW: For now the fetch worker just works in the main thread. If it's
        // a fairly large view though, it may be advantageous to consider breaking up the
        // fetch/write operations on the pipes, somehow.
        // Despite running in the main thread for now, the fetch worker follows the same
        // pattern of utilizing exMarshaller.
        using (var pch = _silent ? null : _host.StartProgressChannel("BinarySaver"))
        {
            FetchWorker(toCompress, data, activeColumns, rowsPerBlock, sw, ch, pch, exMarshaller);
        }

        _host.Assert(compressionTask != null || toCompress.IsCompleted);
        if (compressionTask != null)
            compressionTask.Wait();
        toWrite.CompleteAdding();

        writeThread.Join();
        exMarshaller.ThrowIfSet(ch);
        if (!_silent)
            ch.Info("Wrote {0} rows across {1} columns in {2}", _rowCount, activeColumns.Length, sw.Elapsed);
        // When we dispose the exception marshaller, this will set the cancellation token when we internally
        // dispose the cancellation token source, so one way or another those threads are being cancelled, even
        // if an exception is thrown in the main body of this function.
    }
}
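// A hedged usage sketch of SaveData from the caller's side. The environment parameter, the
// output path, and the Arguments object are assumptions for illustration; the BinarySaver
// constructor is assumed to take a host environment and an Arguments instance, following the
// convention used elsewhere in this codebase.
private static void SaveDataUsageSketch(IHostEnvironment env, IDataView dataView)
{
    var saver = new BinarySaver(env, new BinarySaver.Arguments());
    using (FileStream file = File.Create("data.idv")) // Hypothetical output path.
    {
        // Save the first and third columns; passing no indices is assumed to select
        // every savable column via GetActiveColumns.
        saver.SaveData(file, dataView, 0, 2);
    }
}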
private void FetchWorker(BlockingCollection<Block> toCompress, IDataView data,
    ColumnCodec[] activeColumns, int rowsPerBlock, Stopwatch sw, IChannel ch, IProgressChannel pch, ExceptionMarshaller exMarshaller)
{
    Contracts.AssertValue(ch);
    Contracts.AssertValueOrNull(pch);
    ch.AssertValue(exMarshaller);
    try
    {
        ch.AssertValue(toCompress);
        ch.AssertValue(data);
        ch.AssertValue(activeColumns);
        ch.AssertValue(sw);
        ch.Assert(rowsPerBlock > 0);

        // The main thread handles fetching from the cursor, and storing it into blocks passed to toCompress.
        HashSet<int> activeSet = new HashSet<int>(activeColumns.Select(col => col.SourceIndex));
        long blockIndex = 0;
        int remainingInBlock = rowsPerBlock;
        using (RowCursor cursor = data.GetRowCursor(activeSet.Contains))
        {
            WritePipe[] pipes = new WritePipe[activeColumns.Length];
            for (int c = 0; c < activeColumns.Length; ++c)
                pipes[c] = WritePipe.Create(this, cursor, activeColumns[c]);
            for (int c = 0; c < pipes.Length; ++c)
                pipes[c].BeginBlock();

            long rows = 0;
            if (pch != null)
                pch.SetHeader(new ProgressHeader(new[] { "rows" }), e => e.SetProgress(0, rows));

            while (cursor.MoveNext())
            {
                for (int c = 0; c < pipes.Length; ++c)
                    pipes[c].FetchAndWrite();
                if (--remainingInBlock == 0)
                {
                    for (int c = 0; c < pipes.Length; ++c)
                    {
                        // REVIEW: It may be better if EndBlock got moved to a different worker thread.
                        toCompress.Add(new Block(pipes[c].EndBlock(), c, blockIndex), exMarshaller.Token);
                        pipes[c].BeginBlock();
                    }
                    remainingInBlock = rowsPerBlock;
                    blockIndex++;
                }
                rows++;
            }

            if (remainingInBlock < rowsPerBlock)
            {
                for (int c = 0; c < pipes.Length; ++c)
                    toCompress.Add(new Block(pipes[c].EndBlock(), c, blockIndex), exMarshaller.Token);
            }

            Contracts.Assert(rows == (blockIndex + 1) * rowsPerBlock - remainingInBlock);
            _rowCount = rows;
            if (pch != null)
                pch.Checkpoint(rows);
        }

        toCompress.CompleteAdding();
    }
    catch (Exception ex)
    {
        exMarshaller.Set("cursoring", ex);
    }
}
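// A small illustrative helper (not used by the saver) that makes the block arithmetic above
// explicit: with a fixed rowsPerBlock, row r of a column lands in block r / rowsPerBlock, and
// each column emits a ceiling-divided number of blocks, the last possibly partial. This is
// the same bookkeeping that the assertion over blockIndex and remainingInBlock verifies.
private static long BlockCountSketch(long rows, int rowsPerBlock)
{
    // Ceiling division: a trailing partial block still counts as one block.
    return (rows + rowsPerBlock - 1) / rowsPerBlock;
}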
private void WriteWorker(Stream stream, BlockingCollection<Block> toWrite, ColumnCodec[] activeColumns,
    Schema sourceSchema, int rowsPerBlock, IChannelProvider cp, ExceptionMarshaller exMarshaller)
{
    _host.AssertValue(exMarshaller);
    try
    {
        _host.AssertValue(cp);
        cp.AssertValue(stream);
        cp.AssertValue(toWrite);
        cp.AssertValue(activeColumns);
        cp.AssertValue(sourceSchema);
        cp.Assert(rowsPerBlock > 0);

        using (IChannel ch = cp.Start("Write"))
        {
            var blockLookups = new List<BlockLookup>[activeColumns.Length];
            for (int c = 0; c < blockLookups.Length; ++c)
                blockLookups[c] = new List<BlockLookup>();
            var deadLookups = new int[activeColumns.Length];

            // Reserve space for the header at the start. This will be filled
            // in with valid values once writing has completed.
            ch.CheckIO(stream.Position == 0);
            stream.Write(new byte[Header.HeaderSize], 0, Header.HeaderSize);
            ch.CheckIO(stream.Position == Header.HeaderSize);
            long expectedPosition = stream.Position;
            BlockLookup deadLookup = new BlockLookup();
            foreach (Block block in toWrite.GetConsumingEnumerable(exMarshaller.Token))
            {
                ch.CheckIO(stream.Position == expectedPosition);
                MemoryStream compressed = block.BlockData;
                ArraySegment<byte> buffer;
                bool tmp = compressed.TryGetBuffer(out buffer);
                ch.Assert(tmp);
                stream.Write(buffer.Array, buffer.Offset, buffer.Count);
                BlockLookup currLookup = new BlockLookup(expectedPosition, (int)compressed.Length, block.UncompressedLength);
                expectedPosition += compressed.Length;
                _memPool.Return(ref compressed);
                ch.CheckIO(stream.Position == expectedPosition);

                // Record the position. We have this "lookups" list per column. Yet, it may be that sometimes
                // the writer receives things out of order.
                // REVIEW: The format and the rest of the pipeline supposedly support a long count
                // of blocks, but the writing scheme does not yet support that.
                int blockIndex = (int)block.BlockIndex;
                var lookups = blockLookups[block.ColumnIndex];
                if (lookups.Count == block.BlockIndex) // Received in order.
                    lookups.Add(currLookup);
                else if (lookups.Count < block.BlockIndex) // Received a block a little bit early.
                {
                    // Add a bunch of dead filler lookups, until these late blocks come in.
                    int deadToAdd = (int)block.BlockIndex - lookups.Count;
                    for (int i = 0; i < deadToAdd; ++i)
                        lookups.Add(deadLookup);
                    deadLookups[block.ColumnIndex] += deadToAdd;
                    ch.Assert(lookups.Count == block.BlockIndex);
                    lookups.Add(currLookup);
                }
                else // Received a block a little bit late.
                {
                    // This should be a dead block unless the compressors are buggy and somehow
                    // yielding duplicate blocks or something.
                    ch.Assert(lookups[blockIndex].BlockOffset == 0);
                    deadLookups[block.ColumnIndex]--;
                    lookups[blockIndex] = currLookup;
                }
            }

            // We have finished writing all blocks. We will now write the block lookup tables (so we can
            // find the blocks), the slot names (for any columns that have them), the column table of
            // contents (so we know how to decode the blocks, and where the lookups and names are),
            // and the header (so we know dataview-wide information and where to find the table of
            // contents), in that order.
            long[] lookupOffsets = new long[blockLookups.Length];
            using (BinaryWriter writer = new BinaryWriter(stream, Encoding.UTF8, leaveOpen: true))
            {
                // Write the block lookup directories. These are referenced from the table of contents,
                // so that someone knows where to look for some block data.
                for (int c = 0; c < blockLookups.Length; ++c)
                {
                    ch.Assert(deadLookups[c] == 0);
                    // The block lookup directories are written uncompressed and in fixed length
                    // to enable rapid seeking.
                    lookupOffsets[c] = stream.Position;
                    foreach (BlockLookup lookup in blockLookups[c])
                    {
                        // *** Lookup table entry format ***
                        // long: Offset to the start of a block
                        // int: Byte length of block as written
                        // int: Byte length of block when uncompressed

                        ch.Assert(lookup.BlockOffset > 0);
                        writer.Write(lookup.BlockOffset);
                        writer.Write(lookup.BlockLength);
                        writer.Write(lookup.DecompressedBlockLength);
                    }
                    ch.CheckIO(stream.Position == lookupOffsets[c] + (16 * blockLookups[c].Count),
                        "unexpected offsets after block lookup table write");
                }

                // Write the metadata for each column.
                long[] metadataTocOffsets = new long[activeColumns.Length];
                for (int c = 0; c < activeColumns.Length; ++c)
                    metadataTocOffsets[c] = WriteMetadata(writer, sourceSchema, activeColumns[c].SourceIndex, ch);

                // Write the table of contents.
                long tocOffset = stream.Position;
                {
                    int c = 0;
                    expectedPosition = stream.Position;
                    foreach (var active in activeColumns)
                    {
                        // *** Column TOC entry format ***
                        // string: Column name
                        // codec (as interpretable by CodecFactory.TryGetCodec): Column block codec
                        // CompressionKind (byte): Block compression strategy
                        // LEB128 int: Rows per block
                        // long: Offset to the start of the lookup table
                        // long: Offset to the start of the metadata TOC entries, or 0 if this has no metadata

                        string name = sourceSchema[active.SourceIndex].Name;
                        writer.Write(name);
                        int nameLen = Encoding.UTF8.GetByteCount(name);
                        expectedPosition += Utils.Leb128IntLength((uint)nameLen) + nameLen;
                        ch.CheckIO(stream.Position == expectedPosition, "unexpected offsets after table of contents name");

                        expectedPosition += _factory.WriteCodec(stream, active.Codec);
                        ch.CheckIO(stream.Position == expectedPosition, "unexpected offsets after table of contents type description");

                        writer.Write((byte)_compression);
                        expectedPosition++;

                        // REVIEW: Right now the number of rows per block is fixed, so we
                        // write the same value each time. In some future state, it may be that this
                        // is relaxed, with possibly some tradeoffs (for example, inability to randomly seek).
                        writer.WriteLeb128Int((ulong)rowsPerBlock);
                        expectedPosition += Utils.Leb128IntLength((uint)rowsPerBlock);

                        // Offset of the lookup table.
                        writer.Write(lookupOffsets[c]);
                        expectedPosition += sizeof(long);

                        // Offset of the metadata table of contents.
                        writer.Write(metadataTocOffsets[c]);
                        expectedPosition += sizeof(long);
                        ch.CheckIO(stream.Position == expectedPosition, "unexpected offsets after table of contents");
                        c++;
                    }
                }

                // Write the tail signature.
                long tailOffset = stream.Position;
                writer.Write(Header.TailSignatureValue);

                // Now move back to the beginning of the stream, and write out the now-completed header.
                Header header = new Header()
                {
                    Signature = Header.SignatureValue,
                    Version = Header.WriterVersion,
                    CompatibleVersion = Header.CanBeReadByVersion,
                    TableOfContentsOffset = tocOffset,
                    TailOffset = tailOffset,
                    RowCount = _rowCount,
                    ColumnCount = activeColumns.Length
                };
                byte[] headerBytes = new byte[Header.HeaderSize];
                unsafe
                {
                    Marshal.Copy(new IntPtr(&header), headerBytes, 0, Marshal.SizeOf(typeof(Header)));
                }
                writer.Seek(0, SeekOrigin.Begin);
                writer.Write(headerBytes);
            }
        }
    }
    catch (Exception ex)
    {
        exMarshaller.Set("writing", ex);
    }
}
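// A hedged read-side sketch of the fixed-width lookup format written above. Because every
// entry is exactly 16 bytes (a long block offset, an int compressed length, and an int
// uncompressed length), the entry for any block index can be located by a direct seek with
// no scanning. The method name and tuple shape are illustrative only; the actual reader
// lives elsewhere in this codebase.
private static (long Offset, int Length, int DecompressedLength) ReadLookupEntrySketch(
    Stream stream, long lookupTableOffset, long blockIndex)
{
    stream.Seek(lookupTableOffset + 16 * blockIndex, SeekOrigin.Begin);
    using (var reader = new BinaryReader(stream, Encoding.UTF8, leaveOpen: true))
    {
        long blockOffset = reader.ReadInt64();  // Offset to the start of the block.
        int blockLength = reader.ReadInt32();   // Byte length of the block as written.
        int decompressed = reader.ReadInt32();  // Byte length of the block when uncompressed.
        return (blockOffset, blockLength, decompressed);
    }
}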