/// <summary>
/// Encodes a single metadata value of column <paramref name="col"/> and writes it to
/// <paramref name="stream"/>, using whichever of the uncompressed or compressed
/// encodings turns out to be shorter.
/// </summary>
/// <typeparam name="T">Raw type of the metadata value; asserted to equal <c>type.RawType</c>.</typeparam>
/// <param name="stream">Destination the chosen encoding is written to.</param>
/// <param name="schema">Schema whose column metadata is read.</param>
/// <param name="col">Index of the column within <paramref name="schema"/>.</param>
/// <param name="kind">Metadata kind passed to <c>Metadata.GetValue</c>.</param>
/// <param name="type">Type used to look up a codec in the factory.</param>
/// <param name="compressionKind">
/// Receives <c>CompressionKind.None</c> when the uncompressed bytes were written,
/// the configured <c>_compression</c> when the compressed bytes were written, or the
/// default value when no codec was found.
/// </param>
/// <returns>
/// The codec used to encode the value, or <c>null</c> if the factory has no codec for
/// <paramref name="type"/> (in which case nothing is written).
/// </returns>
private IValueCodec WriteMetadataCore<T>(Stream stream, Schema schema, int col, string kind, ColumnType type, out CompressionKind compressionKind)
{
    _host.Assert(typeof(T) == type.RawType);

    // Without a codec there is nothing we can serialize.
    if (!_factory.TryGetCodec(type, out IValueCodec untypedCodec))
    {
        compressionKind = default(CompressionKind);
        return null;
    }

    var typedCodec = (IValueCodec<T>)untypedCodec;
    T metadataValue = default(T);
    schema[col].Metadata.GetValue(kind, ref metadataValue);

    // Metadata is frequently tiny, and compressing tiny payloads can make them bigger.
    // So produce both encodings and keep the shorter one.
    MemoryStream rawMem = _memPool.Get();
    using (IValueWriter<T> valueWriter = typedCodec.OpenWriter(rawMem))
    {
        valueWriter.Write(in metadataValue);
        valueWriter.Commit();
    }

    MemoryStream packedMem = _memPool.Get();
    // The TryGetBuffer result is captured in a local so the call itself never lives
    // inside the assert expression.
    bool gotBuffer = rawMem.TryGetBuffer(out ArraySegment<byte> payload);
    _host.Assert(gotBuffer);
    using (Stream packer = _compression.CompressStream(packedMem))
    {
        packer.Write(payload.Array, payload.Offset, payload.Count);
    }

    if (packedMem.Length < rawMem.Length)
    {
        // Compression actually helped: switch the payload over to the compressed bytes.
        compressionKind = _compression;
        gotBuffer = packedMem.TryGetBuffer(out payload);
        _host.Assert(gotBuffer);
    }
    else
    {
        // Ties go to the uncompressed form; payload still refers to the raw bytes.
        compressionKind = CompressionKind.None;
    }

    stream.Write(payload.Array, payload.Offset, payload.Count);

    _memPool.Return(ref rawMem);
    _memPool.Return(ref packedMem);
    return typedCodec;
}
/// <summary>
/// Builds, for one column, a writer that targets <c>Stream.Null</c> together with a
/// delegate that pulls the cursor's current value, writes it to that writer, and
/// reports the writer's commit length estimate.
/// </summary>
/// <typeparam name="T">Value type of the column.</typeparam>
/// <param name="cursor">Cursor supplying the getter for the column.</param>
/// <param name="col">Column descriptor; provides the source index and the codec.</param>
/// <param name="fetchWriteEstimator">
/// Receives a delegate that fetches the current value, writes it, and returns
/// <c>GetCommitLengthEstimate()</c> of the writer.
/// </param>
/// <param name="writer">Receives the writer the delegate writes into.</param>
private void EstimatorCore<T>(RowCursor cursor, ColumnCodec col, out Func<long> fetchWriteEstimator, out IValueWriter writer)
{
    ValueGetter<T> fetch = cursor.GetGetter<T>(col.SourceIndex);

    IValueCodec<T> typedCodec = col.Codec as IValueCodec<T>;
    _host.AssertValue(typedCodec);

    // The bytes are thrown away; only the writer's size estimate is consumed.
    IValueWriter<T> sink = typedCodec.OpenWriter(Stream.Null);

    // Shared across invocations so the getter can reuse whatever the value holds.
    T current = default(T);

    long FetchWriteAndEstimate()
    {
        fetch(ref current);
        sink.Write(in current);
        return sink.GetCommitLengthEstimate();
    }

    writer = sink;
    fetchWriteEstimator = FetchWriteAndEstimate;
}
/// <summary>
/// Saves a vector of text values to the model context: first the codec
/// parameterization and the codec-encoded bytes, then a human-readable
/// <c>Terms.txt</c> text stream listing the non-empty entries.
/// </summary>
/// <param name="ch">Channel used for argument checks and assertions.</param>
/// <param name="ctx">Model save context to write into; must not be null.</param>
/// <param name="factory">Codec factory supplying the text-vector codec.</param>
/// <param name="values">The text values to save.</param>
private static void Save(IChannel ch, ModelSaveContext ctx, CodecFactory factory, ref VBuffer<ReadOnlyMemory<char>> values)
{
    Contracts.AssertValue(ch);
    ch.CheckValue(ctx, nameof(ctx));
    ctx.CheckAtModel();
    ctx.SetVersionInfo(GetVersionInfo());

    // *** Binary format ***
    // Codec parameterization: A codec parameterization that should be a ReadOnlyMemory codec
    // int: n, the number of bytes used to write the values
    // byte[n]: As encoded using the codec

    // Get the codec from the factory
    IValueCodec codec;
    var result = factory.TryGetCodec(new VectorType(TextType.Instance), out codec);
    ch.Assert(result);
    ch.Assert(codec.Type.IsVector);
    ch.Assert(codec.Type.VectorSize == 0);
    ch.Assert(codec.Type.ItemType.RawType == typeof(ReadOnlyMemory<char>));
    IValueCodec<VBuffer<ReadOnlyMemory<char>>> textCodec = (IValueCodec<VBuffer<ReadOnlyMemory<char>>>)codec;

    factory.WriteCodec(ctx.Writer.BaseStream, codec);
    using (var mem = new MemoryStream())
    {
        using (var writer = textCodec.OpenWriter(mem))
        {
            writer.Write(ref values);
            writer.Commit();
        }
        ctx.Writer.WriteByteArray(mem.ToArray());
    }

    // Make this resemble, more or less, the auxiliary output from the TermTransform.
    // It will differ somewhat due to the vector being possibly sparse. Empty entries
    // are not written at all; every other entry gets one "key<TAB>text" line.

    // A ref parameter cannot be captured by the lambda below, so copy it to a local.
    var v = values;
    // Scratch buffer reused across entries and grown on demand.
    char[] buffer = null;
    ctx.SaveTextStream("Terms.txt", writer =>
    {
        writer.WriteLine("# Number of terms = {0} of length {1}", v.Count, v.Length);
        foreach (var pair in v.Items())
        {
            var text = pair.Value;
            if (text.IsEmpty)
            {
                continue;
            }

            writer.Write("{0}\t", pair.Key);
            // REVIEW: What about escaping this, *especially* for linebreaks?
            // Do C# and .NET really have no equivalent to Python's "repr"? :(
            // text is non-empty here, so buffer ends up with at least text.Length chars.
            Utils.EnsureSize(ref buffer, text.Length);
            text.Span.CopyTo(buffer);
            writer.WriteLine(buffer, 0, text.Length);
        }
    });
}