/// <summary>
/// A utility method to save a column type and value to a stream, if we have a codec for that.
/// </summary>
/// <typeparam name="T">The static type of the value; it must equal <c>type.RawType</c>,
/// since the codec found for <paramref name="type"/> is cast to <c>IValueCodec{T}</c></typeparam>
/// <param name="stream">The stream to write the type and value to</param>
/// <param name="type">The type of the codec to write and utilize</param>
/// <param name="value">The value to encode and write</param>
/// <param name="bytesWritten">The number of bytes written, computed as the size of the codec
/// definition plus the value writer's commit length estimate; 0 if no codec was found</param>
/// <returns>Whether the write was successful or not</returns>
public bool TryWriteTypeAndValue<T>(Stream stream, ColumnType type, ref T value, out int bytesWritten)
{
    _host.CheckValue(stream, nameof(stream));
    _host.CheckValue(type, nameof(type));
    // Validate against the static type rather than value.GetType(): the codec is cast to
    // IValueCodec<T> below, so T itself has to be the raw type. This form also does not
    // dereference a null value.
    _host.CheckParam(typeof(T) == type.RawType, nameof(value), "Value doesn't match type");

    IValueCodec codec;
    if (!_factory.TryGetCodec(type, out codec))
    {
        bytesWritten = 0;
        return false;
    }

    IValueCodec<T> codecT = (IValueCodec<T>)codec;

    // Write the codec definition first, then the encoded value.
    bytesWritten = _factory.WriteCodec(stream, codec);
    using (var writer = codecT.OpenWriter(stream))
    {
        writer.Write(in value);
        // The length estimate is queried before the commit, and added to the definition size.
        bytesWritten += (int)writer.GetCommitLengthEstimate();
        writer.Commit();
    }
    return true;
}
/// <summary>
/// Serializes a type description for the given codec to a stream, so that the codec can be
/// reconstructed from it later. The return value is the number of bytes written, so that, on a
/// seekable stream, the positions before and after the call would differ by this amount.
/// </summary>
/// <param name="definitionStream">The stream receiving the codec type description</param>
/// <param name="codec">The codec whose load name and parameterization are written</param>
/// <returns>The total number of bytes written to <paramref name="definitionStream"/></returns>
public int WriteCodec(Stream definitionStream, IValueCodec codec)
{
    // *** Codec type description ***
    // string: codec loadname
    // LEB128 int: Byte size of the parameterization
    // byte[]: The indicated parameterization

    using (BinaryWriter writer = OpenBinaryWriter(definitionStream))
    {
        // Load name: the string's length prefix plus its encoded bytes.
        string name = codec.LoadName;
        writer.Write(name);
        int nameByteCount = _encoding.GetByteCount(name);
        int total = checked(nameByteCount + Utils.Leb128IntLength((uint)nameByteCount));

        // The parameterization is staged in a pooled stream, because its byte size
        // has to be written ahead of its content.
        MemoryStream scratch = _memPool.Get();
        int paramByteCount = codec.WriteParameterization(scratch);
        Contracts.Check(scratch.Length == paramByteCount, "codec description length did not match stream length");
        // Possibly unreachable with the current MemoryStream implementation, but checked anyway.
        Contracts.Check(scratch.Length <= int.MaxValue);

        writer.WriteLeb128Int((ulong)scratch.Length);
        total = checked(total + Utils.Leb128IntLength((uint)scratch.Length) + paramByteCount);

        // Rewind the staging stream and append its content after the size prefix.
        scratch.Position = 0;
        scratch.CopyTo(definitionStream);
        _memPool.Return(ref scratch);

        return total;
    }
}
/// <summary>
/// Attempts to define a codec, given a stream positioned at the start of a serialized
/// codec type definition.
/// </summary>
/// <param name="definitionStream">The input stream, which whether this returns true or false
/// will be left at the end of the codec type definition</param>
/// <param name="codec">A codec castable to a generic <c>IValueCodec{T}</c> where
/// <c>typeof(T)==codec.Type.RawType</c></param>
/// <returns>Whether the codec type definition was understood. If true the codec has defined
/// value, and should be usable. If false, the name of the codec was unrecognized. Note that
/// if a malformed definition is detected, this will throw instead of returning either true or
/// false.</returns>
public bool TryReadCodec(Stream definitionStream, out IValueCodec codec)
{
    Contracts.AssertValue(definitionStream, nameof(definitionStream));

    using (IChannel ch = _host.Start("TryGetCodec"))
    using (BinaryReader reader = new BinaryReader(definitionStream, Encoding.UTF8, true))
    {
        // Header: the codec load name, followed by the byte size of its parameterization.
        string signature = reader.ReadString();
        Contracts.CheckDecode(!string.IsNullOrEmpty(signature), "Non-empty signature string expected");
        ulong ulen = reader.ReadLeb128Int();
        Contracts.CheckDecode(ulen <= long.MaxValue, "Codec type definition read from stream too large");
        long len = (long)ulen;

        GetCodecFromStreamDelegate del;
        if (!_loadNameToCodecCreator.TryGetValue(signature, out del))
        {
            codec = default(IValueCodec);

            // Move the stream past the end of the definition. A zero-length definition needs
            // no skipping, but it still falls through to the warning and channel completion
            // below, so that every unrecognized signature is reported the same way.
            if (len > 0)
            {
                if (definitionStream.CanSeek)
                {
                    long remaining = definitionStream.Length - definitionStream.Position;
                    if (remaining < len)
                    {
                        throw ch.ExceptDecode("Codec type definition supposedly has {0} bytes, but end-of-stream reached after {1} bytes", len, remaining);
                    }
                    definitionStream.Seek(len, SeekOrigin.Current);
                }
                else
                {
                    // Non-seekable: consume the definition one byte at a time.
                    for (long i = 0; i < len; ++i)
                    {
                        if (definitionStream.ReadByte() == -1)
                        {
                            throw ch.ExceptDecode("Codec type definition supposedly has {0} bytes, but end-of-stream reached after {1} bytes", len, i);
                        }
                    }
                }
            }

            ch.Warning("Did not recognize value codec signature '{0}'", signature);
            ch.Done();
            return false;
        }

        // Opportunistically validate in the case of a seekable stream.
        long pos = definitionStream.CanSeek ? definitionStream.Position : -1;
        bool retval = del(definitionStream, out codec);
        if (definitionStream.CanSeek && definitionStream.Position - pos != len)
        {
            throw ch.ExceptDecode("Codec type definition supposedly has {0} bytes, but the handler consumed {1}", len, definitionStream.Position - pos);
        }
        ch.Done();
        return retval;
    }
}
/// <summary>
/// Reads one value of type <typeparamref name="T"/> from a stream through the given codec,
/// and hands it back as an <see cref="object"/>.
/// </summary>
/// <param name="stream">The stream the codec's reader is opened on</param>
/// <param name="codec">The codec used to decode the value</param>
/// <returns>The decoded value</returns>
private object LoadValue<T>(Stream stream, IValueCodec<T> codec)
{
    _host.Assert(typeof(T) == codec.Type.RawType);

    T decoded = default(T);
    // NOTE(review): the second argument to OpenReader is presumably the number of items
    // to be read -- confirm against the IValueCodec<T> contract.
    using (var valueReader = codec.OpenReader(stream, 1))
    {
        // Advance onto the value, then fetch it.
        valueReader.MoveNext();
        valueReader.Get(ref decoded);
    }

    return decoded;
}
/// <summary>
/// Attempts to find a codec for the given column type.
/// </summary>
/// <param name="type">The type to find a codec for</param>
/// <param name="codec">The codec produced by whichever lookup handled the type</param>
/// <returns>The result of the lookup that handled the type</returns>
public bool TryGetCodec(ColumnType type, out IValueCodec codec)
{
    // Key and vector types each have a dedicated lookup; every other type is
    // resolved through the simple codec map, by its raw kind.
    bool found;
    if (type.IsKey)
    {
        found = GetKeyCodec(type, out codec);
    }
    else if (type.IsVector)
    {
        found = GetVBufferCodec(type, out codec);
    }
    else
    {
        found = _simpleCodecTypeMap.TryGetValue(type.RawKind, out codec);
    }
    return found;
}
/// <summary>
/// Attempts to find a codec for the given data view type.
/// </summary>
/// <param name="type">The type to find a codec for</param>
/// <param name="codec">The codec produced by whichever lookup handled the type</param>
/// <returns>The result of the lookup that handled the type</returns>
public bool TryGetCodec(DataViewType type, out IValueCodec codec)
{
    // Key and vector types each have a dedicated lookup; every other type is
    // resolved through the simple codec map, by its raw type.
    bool found;
    if (type is KeyType)
    {
        found = GetKeyCodec(type, out codec);
    }
    else if (type is VectorType vectorType)
    {
        found = GetVBufferCodec(vectorType, out codec);
    }
    else
    {
        found = _simpleCodecTypeMap.TryGetValue(type.RawType, out codec);
    }
    return found;
}
/// <summary>
/// Fetches one metadata value of a column and writes its encoding to the stream, either
/// as-is or compressed, whichever of the two turns out smaller.
/// </summary>
/// <typeparam name="T">The raw type of the metadata value; asserted equal to <c>type.RawType</c></typeparam>
/// <param name="stream">The stream the encoded metadata block is written to</param>
/// <param name="schema">The schema holding the column's metadata</param>
/// <param name="col">The index of the column in <paramref name="schema"/></param>
/// <param name="kind">The metadata kind to fetch</param>
/// <param name="type">The type of the metadata value</param>
/// <param name="compressionKind">How the written block was compressed; the default value
/// if no codec was found</param>
/// <returns>The codec used to encode the value, or null if no codec was found, in which
/// case nothing is written</returns>
private IValueCodec WriteMetadataCore<T>(Stream stream, Schema schema, int col, string kind, ColumnType type, out CompressionKind compressionKind)
{
    _host.Assert(typeof(T) == type.RawType);

    IValueCodec untypedCodec;
    if (!_factory.TryGetCodec(type, out untypedCodec))
    {
        compressionKind = default(CompressionKind);
        return null;
    }

    IValueCodec<T> typedCodec = (IValueCodec<T>)untypedCodec;
    T metadataValue = default(T);
    schema[col].Metadata.GetValue(kind, ref metadataValue);

    // Metadata is often small enough that compression does not pay off. Both an
    // uncompressed and a compressed encoding are produced, and the smaller one is kept.
    MemoryStream rawMem = _memPool.Get();
    using (IValueWriter<T> valueWriter = typedCodec.OpenWriter(rawMem))
    {
        valueWriter.Write(in metadataValue);
        valueWriter.Commit();
    }

    MemoryStream packedMem = _memPool.Get();
    ArraySegment<byte> payload;
    bool gotBuffer = rawMem.TryGetBuffer(out payload);
    _host.Assert(gotBuffer);
    using (Stream packer = _compression.CompressStream(packedMem))
    {
        packer.Write(payload.Array, payload.Offset, payload.Count);
    }

    if (packedMem.Length < rawMem.Length)
    {
        // Compression won: switch the payload over to the compressed bytes.
        compressionKind = _compression;
        gotBuffer = packedMem.TryGetBuffer(out payload);
        _host.Assert(gotBuffer);
    }
    else
    {
        // Ties go to the uncompressed form; the payload already refers to it.
        compressionKind = CompressionKind.None;
    }

    stream.Write(payload.Array, payload.Offset, payload.Count);
    _memPool.Return(ref rawMem);
    _memPool.Return(ref packedMem);
    return typedCodec;
}
/// <summary>
/// Sets up write-size estimation for one column: opens a writer for the column's codec over
/// <see cref="Stream.Null"/>, and builds a delegate that, on each call, fetches the column's
/// value from the cursor, writes it to that writer, and returns the writer's commit length
/// estimate.
/// </summary>
/// <typeparam name="T">The type of the column's values</typeparam>
/// <param name="cursor">The cursor the value getter is obtained from</param>
/// <param name="col">The column's source index and codec</param>
/// <param name="fetchWriteEstimator">The fetch-write-estimate delegate</param>
/// <param name="writer">The writer that the delegate writes to</param>
private void EstimatorCore<T>(RowCursor cursor, ColumnCodec col, out Func<long> fetchWriteEstimator, out IValueWriter writer)
{
    ValueGetter<T> fetch = cursor.GetGetter<T>(col.SourceIndex);
    IValueCodec<T> typedCodec = col.Codec as IValueCodec<T>;
    _host.AssertValue(typedCodec);

    IValueWriter<T> typedWriter = typedCodec.OpenWriter(Stream.Null);

    // A single value slot is captured by the delegate and reused on every invocation.
    T current = default(T);
    fetchWriteEstimator =
        () =>
        {
            fetch(ref current);
            typedWriter.Write(in current);
            return typedWriter.GetCommitLengthEstimate();
        };

    writer = typedWriter;
}
/// <summary>
/// Pairs a source column index with a codec.
/// </summary>
/// <param name="sourceIndex">The value stored in <c>SourceIndex</c></param>
/// <param name="codec">The value stored in <c>Codec</c></param>
public ColumnCodec(int sourceIndex, IValueCodec codec)
{
    this.Codec = codec;
    this.SourceIndex = sourceIndex;
}
/// <summary>
/// A helper method to query and write metadata to the stream.
/// </summary>
/// <param name="writer">A binary writer, which if metadata exists for the
/// indicated column the base stream will be positioned just past the end of
/// the written metadata table of contents, and if metadata does not exist
/// remains unchanged</param>
/// <param name="schema">The schema to query for metadata</param>
/// <param name="col">The column we are attempting to get metadata for</param>
/// <param name="ch">The channel to which we write any diagnostic information</param>
/// <returns>The offset of the metadata table of contents, or 0 if there was
/// no metadata</returns>
private long WriteMetadata(BinaryWriter writer, Schema schema, int col, IChannel ch)
{
    _host.AssertValue(writer);
    _host.AssertValue(schema);
    _host.Assert(0 <= col && col < schema.Count);

    // The generic WriteMetadataCore is invoked through reflection, since the metadata
    // value type is only known at runtime.
    WriteMetadataCoreDelegate del = WriteMetadataCore<int>;
    MethodInfo methInfo = del.GetMethodInfo().GetGenericMethodDefinition();
    // Slots 3 and 4 are set per metadata entry (kind and type); slot 5 is read back after
    // each invocation as the compression kind.
    object[] args = new object[] { writer.BaseStream, schema, col, null, null, null };

    // offsets[i] is the start of the i-th written block; one extra trailing entry marks
    // the end of the last block.
    List<long> offsets = new List<long>();
    offsets.Add(writer.BaseStream.Position);
    var metadataInfos = new List<Tuple<string, IValueCodec, CompressionKind>>();
    var kinds = new HashSet<string>();

    // Write all metadata blocks for this column to the file, one after the other, keeping
    // track of the location and size of each for when we write the metadata table of contents.
    // (To be clear, this specific layout is not required by the format.)
    foreach (var metaColumn in schema[col].Metadata.Schema)
    {
        _host.Check(!string.IsNullOrEmpty(metaColumn.Name), "Metadata with null or empty kind detected, disallowed");
        _host.Check(metaColumn.Type != null, "Metadata with null type detected, disallowed");
        if (!kinds.Add(metaColumn.Name))
        {
            throw _host.Except("Metadata with duplicate kind '{0}' encountered, disallowed", metaColumn.Name);
        }

        args[3] = metaColumn.Name;
        args[4] = metaColumn.Type;
        IValueCodec codec = (IValueCodec)methInfo.MakeGenericMethod(metaColumn.Type.RawType).Invoke(this, args);
        if (codec == null)
        {
            // Nothing was written.
            ch.Warning("Could not get codec for type {0}, dropping column '{1}' index {2} metadata kind '{3}'",
                metaColumn.Type, schema[col].Name, col, metaColumn.Name);
            continue;
        }

        offsets.Add(writer.BaseStream.Position);
        _host.CheckIO(offsets[offsets.Count - 1] > offsets[offsets.Count - 2], "Bad offsets detected during write");
        metadataInfos.Add(Tuple.Create(metaColumn.Name, codec, (CompressionKind)args[5]));
    }

    if (metadataInfos.Count == 0)
    {
        _host.CheckIO(writer.BaseStream.Position == offsets[0], "unexpected offset after no writing of metadata");
        return 0;
    }

    // Write the metadata table of contents just past the end of the last metadata block.

    // *** Metadata TOC format ***
    // LEB128 int: Number of metadata TOC entries
    // Metadata TOC entries: As many of these as indicated by the count above

    // expectedPosition tracks where the stream should be after each piece is written, so
    // that any disagreement with the actual stream position is detected.
    long expectedPosition = offsets[metadataInfos.Count];
    writer.WriteLeb128Int((ulong)metadataInfos.Count);
    expectedPosition += Utils.Leb128IntLength((ulong)metadataInfos.Count);
    for (int i = 0; i < metadataInfos.Count; ++i)
    {
        // *** Metadata TOC entry format ***
        // string: metadata kind
        // codec definition: metadata codec
        // CompressionKind(byte): block compression strategy
        // long: Offset into the stream of the start of the metadata block
        // LEB128 int: Byte size of the metadata block in the file

        writer.Write(metadataInfos[i].Item1);
        int stringLen = Encoding.UTF8.GetByteCount(metadataInfos[i].Item1);
        expectedPosition += Utils.Leb128IntLength((ulong)stringLen) + stringLen;
        _host.CheckIO(writer.BaseStream.Position == expectedPosition, "unexpected offsets after metadata table of contents kind");

        expectedPosition += _factory.WriteCodec(writer.BaseStream, metadataInfos[i].Item2);
        _host.CheckIO(writer.BaseStream.Position == expectedPosition, "unexpected offsets after metadata table of contents type description");

        writer.Write((byte)metadataInfos[i].Item3);
        expectedPosition++;
        writer.Write(offsets[i]);
        expectedPosition += sizeof(long);
        long blockSize = offsets[i + 1] - offsets[i];
        writer.WriteLeb128Int((ulong)blockSize);
        expectedPosition += Utils.Leb128IntLength((ulong)blockSize);
        _host.CheckIO(writer.BaseStream.Position == expectedPosition, "unexpected offsets after metadata table of contents location");
    }

    _host.Assert(metadataInfos.Count == offsets.Count - 1);
    // The last recorded offset is the end of the final block, which is where the table
    // of contents begins.
    return offsets[metadataInfos.Count];
}
/// <summary>
/// Saves a vector of text values to a model context: first in binary form, as a codec
/// definition followed by the codec-encoded values, and then as a "Terms.txt" text stream
/// listing the non-empty entries.
/// </summary>
/// <param name="ch">The channel used for assertions and checks</param>
/// <param name="ctx">The model save context to write to</param>
/// <param name="factory">The factory providing and serializing the text vector codec</param>
/// <param name="values">The values to save</param>
private static void Save(IChannel ch, ModelSaveContext ctx, CodecFactory factory, ref VBuffer<ReadOnlyMemory<char>> values)
{
    Contracts.AssertValue(ch);
    ch.CheckValue(ctx, nameof(ctx));
    ctx.CheckAtModel();
    ctx.SetVersionInfo(GetVersionInfo());

    // *** Binary format ***
    // Codec parameterization: A codec parameterization that should be a ReadOnlyMemory codec
    // int: n, the number of bytes used to write the values
    // byte[n]: As encoded using the codec

    // Get the codec from the factory
    IValueCodec codec;
    var result = factory.TryGetCodec(new VectorType(TextType.Instance), out codec);
    ch.Assert(result);
    ch.Assert(codec.Type.IsVector);
    ch.Assert(codec.Type.VectorSize == 0);
    ch.Assert(codec.Type.ItemType.RawType == typeof(ReadOnlyMemory<char>));
    IValueCodec<VBuffer<ReadOnlyMemory<char>>> textCodec = (IValueCodec<VBuffer<ReadOnlyMemory<char>>>)codec;

    factory.WriteCodec(ctx.Writer.BaseStream, codec);
    // The values are encoded into a temporary stream, so that they can be written out
    // as a single byte array.
    using (var mem = new MemoryStream())
    {
        using (var writer = textCodec.OpenWriter(mem))
        {
            writer.Write(ref values);
            writer.Commit();
        }
        ctx.Writer.WriteByteArray(mem.ToArray());
    }

    // Make this resemble, more or less, the auxiliary output from the TermTransform.
    // It will differ somewhat due to the vector being possibly sparse. Empty entries
    // are not written at all.

    // A ref parameter cannot be captured by the lambda, hence the local copy.
    var v = values;
    char[] buffer = null;
    ctx.SaveTextStream("Terms.txt",
        writer =>
        {
            writer.WriteLine("# Number of terms = {0} of length {1}", v.Count, v.Length);
            foreach (var pair in v.Items())
            {
                var text = pair.Value;
                if (text.IsEmpty)
                {
                    continue;
                }
                writer.Write("{0}\t", pair.Key);
                // REVIEW: What about escaping this, *especially* for linebreaks?
                // Do C# and .NET really have no equivalent to Python's "repr"? :(
                Utils.EnsureSize(ref buffer, text.Length);
                text.Span.CopyTo(buffer);
                writer.WriteLine(buffer, 0, text.Length);
            }
        });
}