예제 #1
0
        /// <summary>
        /// Writes a codec description for <paramref name="type"/> to <paramref name="stream"/>,
        /// followed by <paramref name="value"/> encoded with that codec, provided the factory
        /// is able to supply a codec for the type.
        /// </summary>
        /// <typeparam name="T">The static type of the value. The located codec is cast to
        /// <c>IValueCodec{T}</c>, and the runtime type of <paramref name="value"/> is checked
        /// against <c>type.RawType</c>.</typeparam>
        /// <param name="stream">The stream that receives the codec description and the value</param>
        /// <param name="type">The column type used to look up the codec</param>
        /// <param name="value">The value to encode and write</param>
        /// <param name="bytesWritten">The size reported by the codec description write plus the
        /// value writer's commit length estimate, or 0 when no codec is available</param>
        /// <returns><c>true</c> if a codec was found and the value was written,
        /// <c>false</c> if no codec exists for <paramref name="type"/></returns>
        public bool TryWriteTypeAndValue<T>(Stream stream, ColumnType type, ref T value, out int bytesWritten)
        {
            _host.CheckValue(stream, nameof(stream));
            _host.CheckValue(type, nameof(type));
            _host.CheckParam(value.GetType() == type.RawType, nameof(value), "Value doesn't match type");

            if (!_factory.TryGetCodec(type, out IValueCodec untypedCodec))
            {
                // No codec for this type: nothing is written to the stream.
                bytesWritten = 0;
                return false;
            }

            var typedCodec = (IValueCodec<T>)untypedCodec;

            // First the self-describing codec header, then the value itself.
            bytesWritten = _factory.WriteCodec(stream, untypedCodec);
            using (var valueWriter = typedCodec.OpenWriter(stream))
            {
                valueWriter.Write(in value);
                // The estimate is queried before the commit, while the value is still pending.
                bytesWritten += (int)valueWriter.GetCommitLengthEstimate();
                valueWriter.Commit();
            }

            return true;
        }
예제 #2
0
        /// <summary>
        /// Writes a type description for <paramref name="codec"/> to a stream, from which the
        /// codec can be reconstructed later.
        /// </summary>
        /// <param name="definitionStream">The stream that receives the codec description</param>
        /// <param name="codec">The codec whose load name and parameterization are written</param>
        /// <returns>The number of bytes written, so that, if this were a seekable stream, the
        /// positions before and after a call to this method would differ by this amount</returns>
        public int WriteCodec(Stream definitionStream, IValueCodec codec)
        {
            // *** Codec type description ***
            // string: codec loadname
            // LEB128 int: Byte size of the parameterization
            // byte[]: The indicated parameterization

            using (BinaryWriter writer = OpenBinaryWriter(definitionStream))
            {
                string loadName = codec.LoadName;
                writer.Write(loadName);

                // Account for the encoded name plus its length prefix.
                int nameByteCount = _encoding.GetByteCount(loadName);
                int total = checked(nameByteCount + Utils.Leb128IntLength((uint)nameByteCount));

                // The parameterization is staged in a pooled buffer first, because its byte
                // size has to be written ahead of its content.
                MemoryStream staging = _memPool.Get();
                int paramByteCount = codec.WriteParameterization(staging);
                Contracts.Check(staging.Length == paramByteCount, "codec description length did not match stream length");
                Contracts.Check(staging.Length <= int.MaxValue); // Is this even possible in the current implementation of MemoryStream?

                writer.WriteLeb128Int((ulong)staging.Length);
                total = checked(total + Utils.Leb128IntLength((uint)staging.Length) + paramByteCount);

                staging.Position = 0;
                staging.CopyTo(definitionStream);
                _memPool.Return(ref staging);

                return total;
            }
        }
예제 #3
0
        /// <summary>
        /// Attempts to define a codec, given a stream positioned at the start of a serialized
        /// codec type definition.
        /// </summary>
        /// <param name="definitionStream">The input stream. Whether this method returns true or
        /// false, the stream will be left at the end of the codec type definition.</param>
        /// <param name="codec">A codec castable to a generic <c>IValueCodec{T}</c> where
        /// <c>typeof(T)==codec.Type.RawType</c></param>
        /// <returns>Whether the codec type definition was understood. If true the codec has defined
        /// value, and should be usable. If false, the name of the codec was unrecognized. Note that
        /// if a malformed definition is detected, this will throw instead of returning either true
        /// or false.</returns>
        public bool TryReadCodec(Stream definitionStream, out IValueCodec codec)
        {
            Contracts.AssertValue(definitionStream, nameof(definitionStream));

            using (IChannel ch = _host.Start("TryGetCodec"))
            using (BinaryReader reader = new BinaryReader(definitionStream, Encoding.UTF8, true))
            {
                // Header: the codec's load name, then the byte size of its parameterization.
                string signature = reader.ReadString();
                Contracts.CheckDecode(!string.IsNullOrEmpty(signature), "Non-empty signature string expected");
                ulong ulen = reader.ReadLeb128Int();
                Contracts.CheckDecode(ulen <= long.MaxValue, "Codec type definition read from stream too large");
                long len = (long)ulen;

                GetCodecFromStreamDelegate del;
                if (!_loadNameToCodecCreator.TryGetValue(signature, out del))
                {
                    codec = default(IValueCodec);

                    // Move the stream past the end of the definition. With an empty
                    // parameterization there is nothing to skip, but the unrecognized
                    // signature is still reported and the channel is still completed.
                    if (len > 0)
                    {
                        if (definitionStream.CanSeek)
                        {
                            long remaining = definitionStream.Length - definitionStream.Position;
                            if (remaining < len)
                            {
                                throw ch.ExceptDecode("Codec type definition supposedly has {0} bytes, but end-of-stream reached after {1} bytes", len, remaining);
                            }
                            definitionStream.Seek(len, SeekOrigin.Current);
                        }
                        else
                        {
                            // Non-seekable stream: consume the bytes one at a time.
                            for (long i = 0; i < len; ++i)
                            {
                                if (definitionStream.ReadByte() == -1)
                                {
                                    throw ch.ExceptDecode("Codec type definition supposedly has {0} bytes, but end-of-stream reached after {1} bytes", len, i);
                                }
                            }
                        }
                    }

                    ch.Warning("Did not recognize value codec signature '{0}'", signature);
                    ch.Done();
                    return false;
                }

                // Opportunistically validate in the case of a seekable stream: the handler
                // must consume exactly the number of bytes the header declared.
                long pos = definitionStream.CanSeek ? definitionStream.Position : -1;
                bool retval = del(definitionStream, out codec);
                if (definitionStream.CanSeek && definitionStream.Position - pos != len)
                {
                    throw ch.ExceptDecode("Codec type definition supposedly has {0} bytes, but the handler consumed {1}", len, definitionStream.Position - pos);
                }
                ch.Done();
                return retval;
            }
        }
예제 #4
0
        /// <summary>
        /// Reads a single value from <paramref name="stream"/> using <paramref name="codec"/>
        /// and returns it boxed as <see cref="object"/>.
        /// </summary>
        /// <typeparam name="T">The value type; asserted to equal <c>codec.Type.RawType</c></typeparam>
        /// <param name="stream">The stream positioned at the encoded value</param>
        /// <param name="codec">The codec used to open a reader over the stream</param>
        /// <returns>The deserialized value</returns>
        private object LoadValue<T>(Stream stream, IValueCodec<T> codec)
        {
            _host.Assert(typeof(T) == codec.Type.RawType);

            T decoded = default(T);
            // The reader is opened for exactly one item.
            using (var valueReader = codec.OpenReader(stream, 1))
            {
                valueReader.MoveNext();
                valueReader.Get(ref decoded);
            }

            return decoded;
        }
예제 #5
0
 /// <summary>
 /// Looks up a codec for <paramref name="type"/>. Key types are resolved through
 /// <c>GetKeyCodec</c>, vector types through <c>GetVBufferCodec</c>, and every other
 /// type through the simple codec map keyed by <c>type.RawKind</c>.
 /// </summary>
 /// <param name="type">The column type a codec is requested for</param>
 /// <param name="codec">The codec that was found, if any</param>
 /// <returns>Whether a codec was found</returns>
 public bool TryGetCodec(ColumnType type, out IValueCodec codec)
 {
     // Handle the premier types specially; everything else comes from the simple map.
     return type.IsKey
         ? GetKeyCodec(type, out codec)
         : (type.IsVector
             ? GetVBufferCodec(type, out codec)
             : _simpleCodecTypeMap.TryGetValue(type.RawKind, out codec));
 }
예제 #6
0
 /// <summary>
 /// Looks up a codec for <paramref name="type"/>. Key types are resolved through
 /// <c>GetKeyCodec</c>, vector types through <c>GetVBufferCodec</c>, and every other
 /// type through the simple codec map keyed by <c>type.RawType</c>.
 /// </summary>
 /// <param name="type">The data view type a codec is requested for</param>
 /// <param name="codec">The codec that was found, if any</param>
 /// <returns>Whether a codec was found</returns>
 public bool TryGetCodec(DataViewType type, out IValueCodec codec)
 {
     // Handle the premier types specially; everything else comes from the simple map.
     switch (type)
     {
         case KeyType _:
             return GetKeyCodec(type, out codec);

         case VectorType vectorType:
             return GetVBufferCodec(vectorType, out codec);

         default:
             return _simpleCodecTypeMap.TryGetValue(type.RawType, out codec);
     }
 }
예제 #7
0
        /// <summary>
        /// Writes one metadata value of column <paramref name="col"/> to <paramref name="stream"/>,
        /// choosing between the compressed and the uncompressed encoding, whichever is smaller.
        /// </summary>
        /// <typeparam name="T">The metadata value type; asserted to equal <c>type.RawType</c></typeparam>
        /// <param name="stream">The stream that receives the metadata block</param>
        /// <param name="schema">The schema the metadata value is fetched from</param>
        /// <param name="col">The index of the column owning the metadata</param>
        /// <param name="kind">The metadata kind to fetch</param>
        /// <param name="type">The type of the metadata value, used to look up the codec</param>
        /// <param name="compressionKind">The compression actually applied to the written block;
        /// the default value when nothing was written</param>
        /// <returns>The codec used for the write, or <c>null</c> if no codec exists for
        /// <paramref name="type"/>, in which case nothing is written</returns>
        private IValueCodec WriteMetadataCore<T>(Stream stream, Schema schema, int col, string kind, ColumnType type, out CompressionKind compressionKind)
        {
            _host.Assert(typeof(T) == type.RawType);

            if (!_factory.TryGetCodec(type, out IValueCodec untypedCodec))
            {
                compressionKind = default(CompressionKind);
                return null;
            }

            var typedCodec = (IValueCodec<T>)untypedCodec;
            T metadataValue = default(T);
            schema[col].Metadata.GetValue(kind, ref metadataValue);

            // Metadatas will often be pretty small, so that compression makes no sense.
            // Both a compressed and an uncompressed version are produced, and whichever
            // turns out smaller is the one written to the output stream.
            MemoryStream rawStream = _memPool.Get();
            using (IValueWriter<T> valueWriter = typedCodec.OpenWriter(rawStream))
            {
                valueWriter.Write(in metadataValue);
                valueWriter.Commit();
            }

            MemoryStream packedStream = _memPool.Get();
            ArraySegment<byte> payload;
            bool gotBuffer = rawStream.TryGetBuffer(out payload);
            _host.Assert(gotBuffer);

            // The compressing stream is disposed before the lengths are compared, so the
            // comparison happens only after the compressor has been closed.
            using (Stream compressor = _compression.CompressStream(packedStream))
            {
                compressor.Write(payload.Array, payload.Offset, payload.Count);
            }

            if (packedStream.Length < rawStream.Length)
            {
                // Compression paid off: switch the payload over to the compressed bytes.
                compressionKind = _compression;
                gotBuffer = packedStream.TryGetBuffer(out payload);
                _host.Assert(gotBuffer);
            }
            else
            {
                // Ties go to the uncompressed form; the payload already refers to it.
                compressionKind = CompressionKind.None;
            }

            stream.Write(payload.Array, payload.Offset, payload.Count);
            _memPool.Return(ref rawStream);
            _memPool.Return(ref packedStream);
            return typedCodec;
        }
예제 #8
0
        /// <summary>
        /// Builds a size estimator for one column: a writer over <see cref="Stream.Null"/> and a
        /// delegate that, on each call, fetches the cursor's current value for the column, writes
        /// it to that writer, and returns the writer's commit length estimate.
        /// </summary>
        /// <typeparam name="T">The value type of the column</typeparam>
        /// <param name="cursor">The cursor the column values are fetched from</param>
        /// <param name="col">The column's source index and codec</param>
        /// <param name="fetchWriteEstimator">The delegate that fetches, writes and estimates</param>
        /// <param name="writer">The writer the delegate writes to</param>
        private void EstimatorCore<T>(RowCursor cursor, ColumnCodec col,
                                      out Func<long> fetchWriteEstimator, out IValueWriter writer)
        {
            ValueGetter<T> fetch = cursor.GetGetter<T>(col.SourceIndex);

            var typedCodec = col.Codec as IValueCodec<T>;
            _host.AssertValue(typedCodec);

            // Nothing is persisted: the writer targets the null stream and exists only so
            // that its length estimate can be queried.
            IValueWriter<T> typedWriter = typedCodec.OpenWriter(Stream.Null);
            T current = default(T);

            long FetchWriteAndEstimate()
            {
                fetch(ref current);
                typedWriter.Write(in current);
                return typedWriter.GetCommitLengthEstimate();
            }

            writer = typedWriter;
            fetchWriteEstimator = FetchWriteAndEstimate;
        }
예제 #9
0
 /// <summary>
 /// Pairs a source column index with the codec assigned to that column.
 /// </summary>
 /// <param name="sourceIndex">The value stored in <c>SourceIndex</c></param>
 /// <param name="codec">The value stored in <c>Codec</c></param>
 public ColumnCodec(int sourceIndex, IValueCodec codec)
 {
     Codec = codec;
     SourceIndex = sourceIndex;
 }
예제 #10
0
        /// <summary>
        /// A helper method to query and write metadata to the stream. All metadata blocks of
        /// the column are written first, one after the other, followed by a table of contents
        /// describing the kind, codec, compression, offset and size of each block.
        /// </summary>
        /// <param name="writer">A binary writer. If metadata exists for the indicated column,
        /// its base stream will be positioned just past the end of the written metadata table
        /// of contents; if metadata does not exist, the position remains unchanged.</param>
        /// <param name="schema">The schema to query for metadata</param>
        /// <param name="col">The column we are attempting to get metadata for</param>
        /// <param name="ch">The channel to which we write any diagnostic information</param>
        /// <returns>The offset of the metadata table of contents, or 0 if there was
        /// no metadata</returns>
        private long WriteMetadata(BinaryWriter writer, Schema schema, int col, IChannel ch)
        {
            _host.AssertValue(writer);
            _host.AssertValue(schema);
            _host.Assert(0 <= col && col < schema.Count);

            // The <int> instantiation is arbitrary: it only serves to obtain the open generic
            // method definition, which is then closed over each metadata value type below.
            WriteMetadataCoreDelegate del = WriteMetadataCore <int>;
            MethodInfo methInfo           = del.GetMethodInfo().GetGenericMethodDefinition();

            // Slots 3 and 4 (kind and type) are filled in per metadata entry. Slot 5 corresponds
            // to the out CompressionKind parameter and is read back after each invocation.
            object[] args = new object[] { writer.BaseStream, schema, col, null, null, null };

            // offsets[i] is the start of block i; offsets[i + 1] is its end.
            List <long> offsets = new List <long>();

            offsets.Add(writer.BaseStream.Position);
            var metadataInfos = new List <Tuple <string, IValueCodec, CompressionKind> >();
            var kinds         = new HashSet <string>();

            // Write all metadata blocks for this column to the file, one after the other, keeping
            // track of the location and size of each for when we write the metadata table of contents.
            // (To be clear, this specific layout is not required by the format.)

            foreach (var metaColumn in schema[col].Metadata.Schema)
            {
                _host.Check(!string.IsNullOrEmpty(metaColumn.Name), "Metadata with null or empty kind detected, disallowed");
                _host.Check(metaColumn.Type != null, "Metadata with null type detected, disallowed");
                if (!kinds.Add(metaColumn.Name))
                {
                    throw _host.Except("Metadata with duplicate kind '{0}' encountered, disallowed", metaColumn.Name);
                }
                args[3] = metaColumn.Name;
                args[4] = metaColumn.Type;
                IValueCodec codec = (IValueCodec)methInfo.MakeGenericMethod(metaColumn.Type.RawType).Invoke(this, args);
                if (codec == null)
                {
                    // Nothing was written.
                    ch.Warning("Could not get codec for type {0}, dropping column '{1}' index {2} metadata kind '{3}'",
                               metaColumn.Type, schema[col].Name, col, metaColumn.Name);
                    continue;
                }
                offsets.Add(writer.BaseStream.Position);
                // A successfully written block must have advanced the stream.
                _host.CheckIO(offsets[offsets.Count - 1] > offsets[offsets.Count - 2], "Bad offsets detected during write");
                metadataInfos.Add(Tuple.Create(metaColumn.Name, codec, (CompressionKind)args[5]));
            }
            if (metadataInfos.Count == 0)
            {
                _host.CheckIO(writer.BaseStream.Position == offsets[0], "unexpected offset after no writing of metadata");
                return(0);
            }
            // Write the metadata table of contents just past the end of the last metadata block.

            // *** Metadata TOC format ***
            // LEB128 int: Number of metadata TOC entries
            // Metadata TOC entries: As many of these as indicated by the count above

            // expectedPosition tracks where the stream should be after each write, so that
            // any disagreement between computed and actual sizes is caught immediately.
            long expectedPosition = offsets[metadataInfos.Count];

            writer.WriteLeb128Int((ulong)metadataInfos.Count);
            expectedPosition += Utils.Leb128IntLength((ulong)metadataInfos.Count);
            for (int i = 0; i < metadataInfos.Count; ++i)
            {
                // *** Metadata TOC entry format ***
                // string: metadata kind
                // codec definition: metadata codec
                // CompressionKind(byte): block compression strategy
                // long: Offset into the stream of the start of the metadata block
                // LEB128 int: Byte size of the metadata block in the file

                writer.Write(metadataInfos[i].Item1);
                // The string is written as a length prefix followed by its UTF-8 bytes.
                int stringLen = Encoding.UTF8.GetByteCount(metadataInfos[i].Item1);
                expectedPosition += Utils.Leb128IntLength((ulong)stringLen) + stringLen;
                _host.CheckIO(writer.BaseStream.Position == expectedPosition, "unexpected offsets after metadata table of contents kind");

                expectedPosition += _factory.WriteCodec(writer.BaseStream, metadataInfos[i].Item2);
                _host.CheckIO(writer.BaseStream.Position == expectedPosition, "unexpected offsets after metadata table of contents type description");

                writer.Write((byte)metadataInfos[i].Item3);
                expectedPosition++;

                writer.Write(offsets[i]);
                expectedPosition += sizeof(long);

                long blockSize = offsets[i + 1] - offsets[i];
                writer.WriteLeb128Int((ulong)blockSize);
                expectedPosition += Utils.Leb128IntLength((ulong)blockSize);
                _host.CheckIO(writer.BaseStream.Position == expectedPosition, "unexpected offsets after metadata table of contents location");
            }
            _host.Assert(metadataInfos.Count == offsets.Count - 1);
            // The table of contents starts where the last metadata block ended.
            return(offsets[metadataInfos.Count]);
        }
예제 #11
0
        /// <summary>
        /// Saves a vector of text values to the model context: first in binary form through the
        /// text vector codec, then as a human-readable <c>Terms.txt</c> auxiliary text stream.
        /// </summary>
        /// <param name="ch">The channel used for checks and assertions</param>
        /// <param name="ctx">The model save context to write to</param>
        /// <param name="factory">The codec factory supplying the text vector codec</param>
        /// <param name="values">The values to save</param>
        private static void Save(IChannel ch, ModelSaveContext ctx, CodecFactory factory, ref VBuffer <ReadOnlyMemory <char> > values)
        {
            Contracts.AssertValue(ch);
            ch.CheckValue(ctx, nameof(ctx));
            ctx.CheckAtModel();
            ctx.SetVersionInfo(GetVersionInfo());

            // *** Binary format ***
            // Codec parameterization: A codec parameterization that should be a ReadOnlyMemory codec
            // int: n, the number of bytes used to write the values
            // byte[n]: As encoded using the codec

            // Get the codec from the factory
            IValueCodec codec;
            var         result = factory.TryGetCodec(new VectorType(TextType.Instance), out codec);

            ch.Assert(result);
            ch.Assert(codec.Type.IsVector);
            ch.Assert(codec.Type.VectorSize == 0);
            ch.Assert(codec.Type.ItemType.RawType == typeof(ReadOnlyMemory <char>));
            IValueCodec <VBuffer <ReadOnlyMemory <char> > > textCodec = (IValueCodec <VBuffer <ReadOnlyMemory <char> > >)codec;

            factory.WriteCodec(ctx.Writer.BaseStream, codec);
            // The values are encoded into a temporary buffer so they can be emitted as a
            // single byte array.
            using (var mem = new MemoryStream())
            {
                using (var writer = textCodec.OpenWriter(mem))
                {
                    writer.Write(ref values);
                    writer.Commit();
                }
                ctx.Writer.WriteByteArray(mem.ToArray());
            }

            // Make this resemble, more or less, the auxiliary output from the TermTransform.
            // It will differ somewhat due to the vector being possibly sparse. Empty entries
            // are not written at all.

            // A ref parameter cannot be captured by the lambda below, hence the local copy.
            var v = values;

            // Scratch buffer reused across entries.
            char[] buffer = null;
            ctx.SaveTextStream("Terms.txt",
                               writer =>
            {
                writer.WriteLine("# Number of terms = {0} of length {1}", v.Count, v.Length);
                foreach (var pair in v.Items())
                {
                    var text = pair.Value;
                    if (text.IsEmpty)
                    {
                        continue;
                    }
                    writer.Write("{0}\t", pair.Key);
                    // REVIEW: What about escaping this, *especially* for linebreaks?
                    // Do C# and .NET really have no equivalent to Python's "repr"? :(
                    Utils.EnsureSize(ref buffer, text.Length);
                    text.Span.CopyTo(buffer);

                    writer.WriteLine(buffer, 0, text.Length);
                }
            });
        }