/// <summary>
/// Gets the dataview corresponding to this sub-IDV entry. This will
/// lazily load the file if it has not previously been requested. This
/// will return <c>null</c> if the offset is 0.
/// </summary>
public IDataView GetViewOrNull()
{
    if (_view == null && _offset > 0)
    {
        Stream stream = _parent._file.Open(0);
        stream.Seek(_offset, SeekOrigin.Begin);
        Contracts.Check(stream.Position == _offset, "Unexpected position on substream");
        SubsetStream ss = new SubsetStream(stream, _length);
        var binArgs = new BinaryLoader.Arguments();
        if (_parent._threads > 0)
            binArgs.Threads = _parent._threads;
        BinaryLoader loader = new BinaryLoader(Host, binArgs, ss, leaveOpen: false);
        var view = Interlocked.CompareExchange(ref _view, loader, null);
        // If multiple threads raced to load this entry, CompareExchange returns
        // null to exactly one of them, so the verification runs only once, on
        // the loader that was actually published.
        if (view == null)
            VerifyView(loader);
    }
    return _view;
}
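The heart of this example is the publish-once pattern around Interlocked.CompareExchange: every racing thread may construct its own BinaryLoader, but only one instance is ever stored, and only the winning thread runs the one-time verification. Here is a minimal, self-contained sketch of just that pattern (LazySlot and Verify are illustrative names, not ML.NET types):

using System;
using System.Threading;

public sealed class LazySlot
{
    private object _value; // null until first successful load

    public object GetValue()
    {
        if (_value == null)
        {
            // Every racing thread may build a candidate...
            object candidate = new object();
            // ...but only one is published. CompareExchange returns the prior
            // value of _value: null exactly once, for the winning thread.
            object prior = Interlocked.CompareExchange(ref _value, candidate, null);
            if (prior == null)
                Verify(candidate); // one-time check, analogous to VerifyView
        }
        return _value;
    }

    private static void Verify(object value)
    {
        // Placeholder for a one-time consistency check on the loaded value.
        if (value == null)
            throw new InvalidOperationException("Loaded value failed verification.");
    }
}

Losing threads discard their candidate unused; since disposal or cleanup of the losing candidate may matter in practice (the real example constructs a BinaryLoader over a stream), the winner-only branch is also the natural place to hang any such handling.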
private static void Load(IChannel ch, ModelLoadContext ctx, CodecFactory factory,
    ref VBuffer<ReadOnlyMemory<char>> values)
{
    Contracts.AssertValue(ch);
    ch.CheckValue(ctx, nameof(ctx));
    ctx.CheckAtModel(GetVersionInfo());

    // *** Binary format ***
    // Codec parameterization: A codec parameterization that should be a ReadOnlyMemory codec
    // int: n, the number of bytes used to write the values
    // byte[n]: The values, as encoded using the codec

    // Get the codec from the factory, and from the stream. We have to
    // attempt to read the codec from the stream, since codecs can potentially
    // be versioned based on their parameterization.
    IValueCodec codec;
    // Failure here *could* happen if an old version attempts to read a new
    // version. Enabling that sort of version detection is why we also need
    // to write the codec specification.
    if (!factory.TryReadCodec(ctx.Reader.BaseStream, out codec))
        throw ch.ExceptDecode();
    ch.AssertValue(codec);
    ch.CheckDecode(codec.Type.IsVector);
    ch.CheckDecode(codec.Type.ItemType.IsText);
    var textCodec = (IValueCodec<VBuffer<ReadOnlyMemory<char>>>)codec;

    var bufferLen = ctx.Reader.ReadInt32();
    ch.CheckDecode(bufferLen >= 0);
    using (var stream = new SubsetStream(ctx.Reader.BaseStream, bufferLen))
    {
        using (var reader = textCodec.OpenReader(stream, 1))
        {
            reader.MoveNext();
            values = default(VBuffer<ReadOnlyMemory<char>>);
            reader.Get(ref values);
        }
        // The codec must have consumed exactly bufferLen bytes: the bounded
        // subset stream should now report end-of-stream.
        ch.CheckDecode(stream.ReadByte() == -1);
    }
}
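The binary format commented above is plain length-prefixed framing: an int byte count, then exactly that many payload bytes, with the reader checking afterwards that nothing was left unconsumed. A sketch of the same framing using only BinaryReader/BinaryWriter, standing in for the internal SubsetStream-based bounds check (LengthPrefixedBlob is a hypothetical helper, not ML.NET API):

using System.IO;

static class LengthPrefixedBlob
{
    public static void Write(BinaryWriter writer, byte[] payload)
    {
        writer.Write(payload.Length); // int: n, the number of payload bytes
        writer.Write(payload);        // byte[n]: the payload itself
    }

    public static byte[] Read(BinaryReader reader)
    {
        int n = reader.ReadInt32();
        if (n < 0)
            throw new InvalidDataException("Negative payload length.");
        // ReadBytes returns fewer than n bytes only if the stream ended early.
        byte[] payload = reader.ReadBytes(n);
        if (payload.Length != n)
            throw new InvalidDataException("Stream ended before the declared payload length.");
        return payload;
    }
}

The original goes one step further than this sketch: by wrapping the reader in a SubsetStream of exactly bufferLen bytes, it also catches a codec that reads too little, not just a stream that ends too soon.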
private void RunCore(IChannel ch)
{
    Host.AssertValue(ch);
    IDataView data = CreateAndSaveLoader();

    if (!string.IsNullOrWhiteSpace(Args.Columns))
    {
        var args = new ChooseColumnsTransform.Arguments();
        args.Column = Args.Columns
            .Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries)
            .Select(s => new ChooseColumnsTransform.Column() { Name = s })
            .ToArray();
        if (Utils.Size(args.Column) > 0)
            data = new ChooseColumnsTransform(Host, args, data);
    }

    IDataSaver saver;
    if (Args.Saver != null)
        saver = Args.Saver.CreateComponent(Host);
    else
        saver = new TextSaver(Host, new TextSaver.Arguments() { Dense = Args.Dense });

    var cols = new List<int>();
    for (int i = 0; i < data.Schema.ColumnCount; i++)
    {
        if (!Args.KeepHidden && data.Schema.IsHidden(i))
            continue;
        var type = data.Schema.GetColumnType(i);
        if (saver.IsColumnSavable(type))
            cols.Add(i);
        else
        {
            ch.Info(MessageSensitivity.Schema,
                "The column '{0}' will not be written as it has an unsavable column type.",
                data.Schema.GetColumnName(i));
        }
    }
    Host.NotSensitive().Check(cols.Count > 0, "No valid columns to save");

    // Send the first N lines to console.
    if (Args.Rows > 0)
    {
        var args = new SkipTakeFilter.TakeArguments() { Count = Args.Rows };
        data = SkipTakeFilter.Create(Host, args, data);
    }
    var textSaver = saver as TextSaver;
    // If it is a text saver, use a special utility for this purpose.
    if (textSaver != null)
        textSaver.WriteData(data, true, cols.ToArray());
    else
    {
        using (MemoryStream mem = new MemoryStream())
        {
            // The SubsetStream wrapper shields the memory stream, so the
            // saver disposing its stream does not close the buffer we still
            // need to rewind and read.
            using (Stream wrapStream = new SubsetStream(mem))
                saver.SaveData(wrapStream, data, cols.ToArray());
            mem.Seek(0, SeekOrigin.Begin);
            using (StreamReader reader = new StreamReader(mem))
            {
                string result = reader.ReadToEnd();
                ch.Info(MessageSensitivity.UserData | MessageSensitivity.Schema, result);
            }
        }
    }
}
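In the fallback branch above, SubsetStream is used purely as a shield: saver.SaveData disposes the stream it is handed, but the caller still needs to rewind mem and read the output back. Assuming that is the relevant behavior (the example keeps using mem after wrapStream is disposed, which implies disposing a SubsetStream does not close its parent), the same effect can be had with a small non-closing wrapper; a sketch:

using System.IO;

// A minimal stand-in for SubsetStream's "don't close the parent" behavior:
// every operation delegates to the inner stream, but Dispose does not close it.
sealed class NonClosingStream : Stream
{
    private readonly Stream _inner;
    public NonClosingStream(Stream inner) { _inner = inner; }

    public override bool CanRead => _inner.CanRead;
    public override bool CanSeek => _inner.CanSeek;
    public override bool CanWrite => _inner.CanWrite;
    public override long Length => _inner.Length;
    public override long Position
    {
        get => _inner.Position;
        set => _inner.Position = value;
    }
    public override void Flush() => _inner.Flush();
    public override int Read(byte[] buffer, int offset, int count) => _inner.Read(buffer, offset, count);
    public override long Seek(long offset, SeekOrigin origin) => _inner.Seek(offset, origin);
    public override void SetLength(long value) => _inner.SetLength(value);
    public override void Write(byte[] buffer, int offset, int count) => _inner.Write(buffer, offset, count);

    protected override void Dispose(bool disposing)
    {
        // Deliberately do not dispose _inner; just make sure writes are flushed.
        if (disposing)
            _inner.Flush();
        base.Dispose(disposing);
    }
}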
private void SaveTransposedData(IChannel ch, Stream stream, ITransposeDataView data, int[] cols)
{
    _host.AssertValue(ch);
    ch.AssertValue(stream);
    ch.AssertValue(data);
    ch.AssertNonEmpty(cols);
    ch.Assert(stream.CanSeek);

    // Initialize what we can in the header, though we will not be writing out things in the
    // header until we have confidence that things were written out correctly.
    TransposeLoader.Header header = default(TransposeLoader.Header);
    header.Signature = TransposeLoader.Header.SignatureValue;
    header.Version = TransposeLoader.Header.WriterVersion;
    header.CompatibleVersion = TransposeLoader.Header.WriterVersion;
    VectorType slotType = data.TransposeSchema.GetSlotType(cols[0]);
    ch.AssertValue(slotType);
    header.RowCount = slotType.ValueCount;
    header.ColumnCount = cols.Length;

    // We keep track of the offset of the start of each sub-IDV, for use in writing out the
    // offsets/length table later.
    List<long> offsets = new List<long>();
    // First write a bunch of zeros at the head, as a placeholder for the header that
    // will go there assuming we can successfully load it. We'll keep this array around
    // for the real marshalling and writing of the header bytes structure.
    byte[] headerBytes = new byte[TransposeLoader.Header.HeaderSize];
    stream.Write(headerBytes, 0, headerBytes.Length);
    offsets.Add(stream.Position);

    // This is a convenient delegate to write out an IDV substream, then record the
    // offset where writing stopped in the offsets list.
    Action<string, IDataView> viewAction = (name, view) =>
    {
        using (var substream = new SubsetStream(stream))
        {
            _internalSaver.SaveData(substream, view, Utils.GetIdentityPermutation(view.Schema.ColumnCount));
            substream.Seek(0, SeekOrigin.End);
            ch.Info("Wrote {0} data view in {1} bytes", name, substream.Length);
        }
        offsets.Add(stream.Position);
    };

    // First write out the no-row data, limited to these columns.
    IDataView subdata = new ChooseColumnsByIndexTransform(_host,
        new ChooseColumnsByIndexTransform.Arguments() { Index = cols }, data);
    // If we want the "dual mode" row-wise and slot-wise file, don't filter out anything.
    if (!_writeRowData)
        subdata = SkipTakeFilter.Create(_host, new SkipTakeFilter.TakeArguments() { Count = 0 }, subdata);

    string msg = _writeRowData ? "row-wise data, schema, and metadata" : "schema and metadata";
    viewAction(msg, subdata);
    foreach (var col in cols)
        viewAction(data.Schema.GetColumnName(col), new TransposerUtils.SlotDataView(_host, data, col));

    // We've written out the data views. Now write out the offsets table.
    using (var writer = new BinaryWriter(stream, Encoding.UTF8, leaveOpen: true))
    {
        // Format of the table is offset, length, both as 8-byte integers.
        // As it happens we wrote things out as adjacent sub-IDVs, so the
        // length can be derived from the offsets. The first will be the
        // start of the first sub-IDV, and all subsequent entries will be
        // the start/end of the current/next sub-IDV, respectively, so a total
        // of cols.Length + 2 entries.
        ch.Assert(offsets.Count == cols.Length + 2);
        ch.Assert(offsets[offsets.Count - 1] == stream.Position);

        header.SubIdvTableOffset = stream.Position;
        for (int c = 1; c < offsets.Count; ++c)
        {
            // 8-byte int for the offset, 8-byte int for the length.
            writer.Write(offsets[c - 1]);
            writer.Write(offsets[c] - offsets[c - 1]);
        }
        header.TailOffset = stream.Position;
        writer.Write(TransposeLoader.Header.TailSignatureValue);

        // Now we are confident that things will work, so write out the header.
        unsafe
        {
            Marshal.Copy(new IntPtr(&header), headerBytes, 0, Marshal.SizeOf(typeof(TransposeLoader.Header)));
        }
        writer.Seek(0, SeekOrigin.Begin);
        writer.Write(headerBytes);
    }
}
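The file layout this produces is: a header placeholder, then cols.Length + 1 adjacent sub-IDV blocks (the no-row view plus one per column), then an (offset, length) table of 8-byte integers, then a tail signature, and finally the real header written back over the placeholder. A sketch of the matching read side for just the table; the real parsing lives in TransposeLoader, and the names here are illustrative:

using System.Collections.Generic;
using System.IO;

static class SubIdvTable
{
    // Hypothetical reader for the (offset, length) table written above.
    // tableOffset would come from the header's sub-IDV table offset field,
    // and entryCount is columnCount + 1: the no-row view plus one per column.
    public static List<(long Offset, long Length)> Read(Stream stream, long tableOffset, int entryCount)
    {
        stream.Seek(tableOffset, SeekOrigin.Begin);
        var entries = new List<(long Offset, long Length)>(entryCount);
        using (var reader = new BinaryReader(stream, System.Text.Encoding.UTF8, leaveOpen: true))
        {
            for (int i = 0; i < entryCount; i++)
            {
                long offset = reader.ReadInt64(); // 8-byte int: start of the sub-IDV
                long length = reader.ReadInt64(); // 8-byte int: its length in bytes
                entries.Add((offset, length));
            }
        }
        return entries;
    }
}

Because the writer only overwrites the header placeholder after everything else succeeds, a reader that finds the all-zero placeholder (rather than a valid signature) can conclude the file was never completely written.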