Пример #1
0
 /// <summary>
 /// Gets the dataview corresponding to this sub-IDV entry. This will
 /// lazily load the file, if it has not previously been requested. This
 /// will return <c>null</c> if the offset is 0.
 /// </summary>
 public IDataView GetViewOrNull()
 {
     if (_view == null && _offset > 0)
     {
         Stream stream = _parent._file.Open(0);
         stream.Seek(_offset, SeekOrigin.Begin);
         Contracts.Check(stream.Position == _offset, "Unexpected position on substream");
         SubsetStream ss      = new SubsetStream(stream, _length);
         var          binArgs = new BinaryLoader.Arguments();
         if (_parent._threads > 0)
         {
             binArgs.Threads = _parent._threads;
         }
         BinaryLoader loader = new BinaryLoader(Host,
                                                binArgs, ss, leaveOpen: false);
         var view = Interlocked.CompareExchange(ref _view, loader, null);
         // If multiple threads have called this as it was being loaded,
         // have ensure that this check only happens once.
         if (view == loader)
         {
             VerifyView(view);
         }
     }
     return(_view);
 }
Пример #2
0
        private static void Load(IChannel ch, ModelLoadContext ctx, CodecFactory factory, ref VBuffer <ReadOnlyMemory <char> > values)
        {
            Contracts.AssertValue(ch);
            ch.CheckValue(ctx, nameof(ctx));
            ctx.CheckAtModel(GetVersionInfo());

            // *** Binary format ***
            // Codec parameterization: A codec parameterization that should be a ReadOnlyMemory codec
            // int: n, the number of bytes used to write the values
            // byte[n]: As encoded using the codec

            // Get the codec from the factory, and from the stream. We have to
            // attempt to read the codec from the stream, since codecs can potentially
            // be versioned based on their parameterization.
            IValueCodec codec;

            // This *could* happen if we have an old version attempt to read a new version.
            // Enabling this sort of binary classification is why we also need to write the
            // codec specification.
            if (!factory.TryReadCodec(ctx.Reader.BaseStream, out codec))
            {
                throw ch.ExceptDecode();
            }
            ch.AssertValue(codec);
            ch.CheckDecode(codec.Type.IsVector);
            ch.CheckDecode(codec.Type.ItemType.IsText);
            var textCodec = (IValueCodec <VBuffer <ReadOnlyMemory <char> > >)codec;

            var bufferLen = ctx.Reader.ReadInt32();

            ch.CheckDecode(bufferLen >= 0);
            using (var stream = new SubsetStream(ctx.Reader.BaseStream, bufferLen))
            {
                using (var reader = textCodec.OpenReader(stream, 1))
                {
                    reader.MoveNext();
                    values = default(VBuffer <ReadOnlyMemory <char> >);
                    reader.Get(ref values);
                }
                ch.CheckDecode(stream.ReadByte() == -1);
            }
        }
Пример #3
0
        private void RunCore(IChannel ch)
        {
            Host.AssertValue(ch);
            IDataView data = CreateAndSaveLoader();

            if (!string.IsNullOrWhiteSpace(Args.Columns))
            {
                var args = new ChooseColumnsTransform.Arguments();
                args.Column = Args.Columns
                              .Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries).Select(s => new ChooseColumnsTransform.Column()
                {
                    Name = s
                }).ToArray();
                if (Utils.Size(args.Column) > 0)
                {
                    data = new ChooseColumnsTransform(Host, args, data);
                }
            }

            IDataSaver saver;

            if (Args.Saver != null)
            {
                saver = Args.Saver.CreateComponent(Host);
            }
            else
            {
                saver = new TextSaver(Host, new TextSaver.Arguments()
                {
                    Dense = Args.Dense
                });
            }
            var cols = new List <int>();

            for (int i = 0; i < data.Schema.ColumnCount; i++)
            {
                if (!Args.KeepHidden && data.Schema.IsHidden(i))
                {
                    continue;
                }
                var type = data.Schema.GetColumnType(i);
                if (saver.IsColumnSavable(type))
                {
                    cols.Add(i);
                }
                else
                {
                    ch.Info(MessageSensitivity.Schema, "The column '{0}' will not be written as it has unsavable column type.", data.Schema.GetColumnName(i));
                }
            }
            Host.NotSensitive().Check(cols.Count > 0, "No valid columns to save");

            // Send the first N lines to console.
            if (Args.Rows > 0)
            {
                var args = new SkipTakeFilter.TakeArguments()
                {
                    Count = Args.Rows
                };
                data = SkipTakeFilter.Create(Host, args, data);
            }
            var textSaver = saver as TextSaver;

            // If it is a text saver, utilize a special utility for this purpose.
            if (textSaver != null)
            {
                textSaver.WriteData(data, true, cols.ToArray());
            }
            else
            {
                using (MemoryStream mem = new MemoryStream())
                {
                    using (Stream wrapStream = new SubsetStream(mem))
                        saver.SaveData(wrapStream, data, cols.ToArray());
                    mem.Seek(0, SeekOrigin.Begin);
                    using (StreamReader reader = new StreamReader(mem))
                    {
                        string result = reader.ReadToEnd();
                        ch.Info(MessageSensitivity.UserData | MessageSensitivity.Schema, result);
                    }
                }
            }
        }
Пример #4
0
        private void SaveTransposedData(IChannel ch, Stream stream, ITransposeDataView data, int[] cols)
        {
            _host.AssertValue(ch);
            ch.AssertValue(stream);
            ch.AssertValue(data);
            ch.AssertNonEmpty(cols);
            ch.Assert(stream.CanSeek);

            // Initialize what we can in the header, though we will not be writing out things in the
            // header until we have confidence that things were written out correctly.
            TransposeLoader.Header header = default(TransposeLoader.Header);
            header.Signature         = TransposeLoader.Header.SignatureValue;
            header.Version           = TransposeLoader.Header.WriterVersion;
            header.CompatibleVersion = TransposeLoader.Header.WriterVersion;
            VectorType slotType = data.TransposeSchema.GetSlotType(cols[0]);

            ch.AssertValue(slotType);
            header.RowCount    = slotType.ValueCount;
            header.ColumnCount = cols.Length;

            // We keep track of the offsets of the start of each sub-IDV, for use in writing out the
            // offsets/length table later.
            List <long> offsets = new List <long>();

            // First write a bunch of zeros at the head, as a placeholder for the header that
            // will go there assuming we can successfully load it. We'll keep this array around
            // for the real marshalling and writing of the header bytes structure.
            byte[] headerBytes = new byte[TransposeLoader.Header.HeaderSize];
            stream.Write(headerBytes, 0, headerBytes.Length);
            offsets.Add(stream.Position);

            // This is a convenient delegate to write out an IDV substream, then save the offsets
            // where writing stopped to the offsets list.
            Action <string, IDataView> viewAction =
                (name, view) =>
            {
                using (var substream = new SubsetStream(stream))
                {
                    _internalSaver.SaveData(substream, view, Utils.GetIdentityPermutation(view.Schema.ColumnCount));
                    substream.Seek(0, SeekOrigin.End);
                    ch.Info("Wrote {0} data view in {1} bytes", name, substream.Length);
                }
                offsets.Add(stream.Position);
            };

            // First write out the no-row data, limited to these columns.
            IDataView subdata = new ChooseColumnsByIndexTransform(_host,
                                                                  new ChooseColumnsByIndexTransform.Arguments()
            {
                Index = cols
            }, data);

            // If we want the "dual mode" row-wise and slot-wise file, don't filter out anything.
            if (!_writeRowData)
            {
                subdata = SkipTakeFilter.Create(_host, new SkipTakeFilter.TakeArguments()
                {
                    Count = 0
                }, subdata);
            }

            string msg = _writeRowData ? "row-wise data, schema, and metadata" : "schema and metadata";

            viewAction(msg, subdata);
            foreach (var col in cols)
            {
                viewAction(data.Schema.GetColumnName(col), new TransposerUtils.SlotDataView(_host, data, col));
            }

            // Wrote out the dataview. Write out the table offset.
            using (var writer = new BinaryWriter(stream, Encoding.UTF8, leaveOpen: true))
            {
                // Format of the table is offset, length, both as 8-byte integers.
                // As it happens we wrote things out as adjacent sub-IDVs, so the
                // length can be derived from the offsets. The first will be the
                // start of the first sub-IDV, and all subsequent entries will be
                // the start/end of the current/next sub-IDV, respectively, so a total
                // of cols.Length + 2 entries.
                ch.Assert(offsets.Count == cols.Length + 2);
                ch.Assert(offsets[offsets.Count - 1] == stream.Position);
                header.SubIdvTableOffset = stream.Position;
                for (int c = 1; c < offsets.Count; ++c)
                {
                    // 8-byte int for offsets, 8-byte int for length.
                    writer.Write(offsets[c - 1]);
                    writer.Write(offsets[c] - offsets[c - 1]);
                }
                header.TailOffset = stream.Position;
                writer.Write(TransposeLoader.Header.TailSignatureValue);

                // Now we are confident that things will work, so write it out.
                unsafe
                {
                    Marshal.Copy(new IntPtr(&header), headerBytes, 0, Marshal.SizeOf(typeof(Header)));
                }
                writer.Seek(0, SeekOrigin.Begin);
                writer.Write(headerBytes);
            }
        }