/// <summary>
/// Writes the dataset to the output stream.
/// </summary>
/// <param name="dataSet">Dataset to write.</param>
/// <param name="compression">Compression method.</param>
/// <param name="append">When true, appends to the file; otherwise creates a new file.</param>
public void Write(DataSet dataSet, CompressionMethod compression = CompressionMethod.Gzip, bool append = false)
{
   PrepareFile(dataSet, append);

   var stats = new DataSetStats(dataSet);

   int offset = 0;
   int count;
   do
   {
      count = Math.Min(_writerOptions.RowGroupsSize, dataSet.Count - offset);
      Thrift.RowGroup rg = _meta.AddRowGroup();
      long rgStartPos = _output.Position;

      rg.Columns = dataSet.Schema.Elements
         .Select(c => Write(c, dataSet.GetColumn(c.Name, offset, count), compression, stats.GetColumnStats(c)))
         .ToList();

      //row group's size is a sum of sizes of all column chunks in it, including the headers;
      //luckily ColumnChunk already contains page+header sizes in its meta. Note that compressed
      //sizes are summed here, although the Parquet spec defines total_byte_size as uncompressed.
      rg.Total_byte_size = rg.Columns.Sum(c => c.Meta_data.Total_compressed_size);
      rg.Num_rows = count;

      offset += _writerOptions.RowGroupsSize;
   }
   while (offset < dataSet.Count);

   _dataWritten = true;
}
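// Usage sketch (illustrative, not part of the library source): calling the
// Write(...) overload above with a small DataSet. Assumes this version's
// public DataSet/SchemaElement<T> API; the file name and values are
// hypothetical.
using System.IO;
using Parquet;
using Parquet.Data;

public static class WriteExample
{
   public static void Run()
   {
      var ds = new DataSet(new SchemaElement<int>("id"), new SchemaElement<string>("city"));
      ds.Add(1, "London");
      ds.Add(2, "Derby");

      using (Stream fs = File.Create("example.parquet"))
      using (var writer = new ParquetWriter(fs))
      {
         writer.Write(ds, CompressionMethod.Gzip, append: false);
      }
   }
}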
/// <summary>
/// Writes the dataset to the output stream.
/// </summary>
/// <param name="dataSet">Dataset to write.</param>
/// <param name="compression">Compression method.</param>
/// <param name="append">When true, appends to the file; otherwise creates a new file.</param>
public void Write(DataSet dataSet, CompressionMethod compression = CompressionMethod.Gzip, bool append = false)
{
   PrepareFile(dataSet, append);

   int offset = 0;
   int count;
   do
   {
      count = Math.Min(_writerOptions.RowGroupsSize, dataSet.Count - offset);
      Thrift.RowGroup rg = _meta.AddRowGroup();
      long rgStartPos = Stream.Position;
      rg.Columns = new List<Thrift.ColumnChunk>();

      foreach (SchemaElement se in dataSet.Schema.Flatten())
      {
         var cw = new ColumnWriter(Stream, ThriftStream, _meta, se, compression, _formatOptions, _writerOptions);
         IList values = dataSet.GetColumn(se, offset, count);
         Thrift.ColumnChunk chunk = cw.Write(offset, count, values);
         rg.Columns.Add(chunk);
      }

      //row group's size is a sum of sizes of all column chunks in it, including the headers;
      //luckily ColumnChunk already contains page+header sizes in its meta. Note that compressed
      //sizes are summed here, although the Parquet spec defines total_byte_size as uncompressed.
      rg.Total_byte_size = rg.Columns.Sum(c => c.Meta_data.Total_compressed_size);
      rg.Num_rows = count;

      offset += _writerOptions.RowGroupsSize;
   }
   while (offset < dataSet.Count);

   _dataWritten = true;
}
internal ParquetRowGroupReader(
   Thrift.RowGroup rowGroup,
   ThriftFooter footer,
   Stream stream, ThriftStream thriftStream,
   ParquetOptions parquetOptions)
{
   _rowGroup = rowGroup ?? throw new ArgumentNullException(nameof(rowGroup));
   _footer = footer ?? throw new ArgumentNullException(nameof(footer));
   _stream = stream ?? throw new ArgumentNullException(nameof(stream));
   _thriftStream = thriftStream ?? throw new ArgumentNullException(nameof(thriftStream));
   _parquetOptions = parquetOptions ?? throw new ArgumentNullException(nameof(parquetOptions));

   //cache chunks
   foreach (Thrift.ColumnChunk thriftChunk in _rowGroup.Columns)
   {
      string path = thriftChunk.GetPath();
      _pathToChunk[path] = thriftChunk;
   }
}
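// Sketch of how this internal reader is typically reached (v3-style public
// API; assumes ParquetReader exposes RowGroupCount/OpenRowGroupReader as in
// the released library; the file name is hypothetical):
using System.IO;
using Parquet;
using Parquet.Data;

public static class RowGroupReadExample
{
   public static void Run()
   {
      using (Stream fs = File.OpenRead("example.parquet"))
      using (var reader = new ParquetReader(fs))
      {
         DataField[] fields = reader.Schema.GetDataFields();
         for (int i = 0; i < reader.RowGroupCount; i++)
         {
            using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(i))
            {
               // each ReadColumn call resolves the chunk cached in _pathToChunk above
               DataColumn col = rg.ReadColumn(fields[0]);
            }
         }
      }
   }
}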
internal ParquetRowGroupWriter(Schema schema,
   Stream stream,
   ThriftStream thriftStream,
   ThriftFooter footer,
   CompressionMethod compressionMethod,
   ParquetOptions formatOptions)
{
   _schema = schema ?? throw new ArgumentNullException(nameof(schema));
   _stream = stream ?? throw new ArgumentNullException(nameof(stream));
   _thriftStream = thriftStream ?? throw new ArgumentNullException(nameof(thriftStream));
   _footer = footer ?? throw new ArgumentNullException(nameof(footer));
   _compressionMethod = compressionMethod;
   _formatOptions = formatOptions;

   _thriftRowGroup = _footer.AddRowGroup();
   _rgStartPos = _stream.Position;
   _thriftRowGroup.Columns = new List<Thrift.ColumnChunk>();
   _thschema = _footer.GetWriteableSchema();
}
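// Writer-side counterpart sketch (v3-style public API; assumes
// ParquetWriter.CreateRowGroup constructs this class, as in the released
// library's 3.x surface; schema and data are illustrative):
using System.IO;
using Parquet;
using Parquet.Data;

public static class RowGroupWriteExample
{
   public static void Run()
   {
      var id = new DataField<int>("id");
      var schema = new Schema(id);

      using (Stream fs = File.Create("example.parquet"))
      using (var writer = new ParquetWriter(schema, fs))
      using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
      {
         rg.WriteColumn(new DataColumn(id, new[] { 1, 2, 3 }));
      }
   }
}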
/// <summary>
/// Writes the dataset to the output stream.
/// </summary>
/// <param name="dataSet">Dataset to write.</param>
/// <param name="compression">Compression method.</param>
public void Write(DataSet dataSet, CompressionMethod compression = CompressionMethod.Gzip)
{
   _meta.AddSchema(dataSet);

   var stats = new DataSetStats(dataSet);

   long totalCount = dataSet.Count;

   Thrift.RowGroup rg = _meta.AddRowGroup();
   long rgStartPos = _output.Position;
   rg.Columns = dataSet.Schema.Elements
      .Select(c => Write(c, dataSet.GetColumn(c.Name), compression, stats.GetColumnStats(c)))
      .ToList();

   //row group's size is a sum of _uncompressed_ sizes of all columns in it
   rg.Total_byte_size = rg.Columns.Sum(c => c.Meta_data.Total_uncompressed_size);
   rg.Num_rows = dataSet.Count;

   _dataWritten = true;
}
/// <summary>
/// Writes the dataset to the output stream.
/// </summary>
/// <param name="dataSet">Dataset to write.</param>
/// <param name="compression">Compression method.</param>
/// <param name="append">When true, appends to the file; otherwise creates a new file.</param>
public void Write(DataSet dataSet, CompressionMethod compression = CompressionMethod.Gzip, bool append = false)
{
   PrepareFile(dataSet, append);
   _footer.CustomMetadata = dataSet.Metadata.Custom;

   int offset = 0;
   int count;
   List<Thrift.SchemaElement> writeableSchema = _footer.GetWriteableSchema().ToList();

   do
   {
      count = Math.Min(_writerOptions.RowGroupsSize, dataSet.Count - offset);
      Thrift.RowGroup rg = _footer.AddRowGroup();
      long rgStartPos = Stream.Position;
      rg.Columns = new List<Thrift.ColumnChunk>();

      foreach (Thrift.SchemaElement tse in writeableSchema)
      {
         List<string> path = _footer.GetPath(tse);
         string flatPath = string.Join(Schema.PathSeparator, path);
         var cw = new ColumnarWriter(Stream, ThriftStream, _footer, tse, path, compression, _formatOptions, _writerOptions);

         IList values = dataSet.GetColumn(flatPath, offset, count);
         Thrift.ColumnChunk chunk = cw.Write(offset, count, values);
         rg.Columns.Add(chunk);
      }

      //row group's size is a sum of sizes of all column chunks in it, including the headers;
      //luckily ColumnChunk already contains page+header sizes in its meta. Note that compressed
      //sizes are summed here, although the Parquet spec defines total_byte_size as uncompressed.
      rg.Total_byte_size = rg.Columns.Sum(c => c.Meta_data.Total_compressed_size);
      rg.Num_rows = count;

      offset += _writerOptions.RowGroupsSize;
   }
   while (offset < dataSet.Count);

   _dataWritten = true;
}
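// The row-group chunking arithmetic used above, in isolation. With 2,500 rows
// and RowGroupsSize = 1000 this produces groups of 1000, 1000 and 500 rows
// (values are illustrative):
using System;

public static class ChunkingDemo
{
   public static void Main()
   {
      int total = 2500, groupSize = 1000, offset = 0;
      do
      {
         int count = Math.Min(groupSize, total - offset);
         Console.WriteLine($"row group at offset {offset}: {count} rows");
         offset += groupSize;
      } while (offset < total);
      // prints: offset 0: 1000 rows, offset 1000: 1000 rows, offset 2000: 500 rows
   }
}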
public void Read(TProtocol iprot)
{
   iprot.IncrementRecursionDepth();
   try
   {
      bool isset_version = false;
      bool isset_schema = false;
      bool isset_num_rows = false;
      bool isset_row_groups = false;
      TField field;
      iprot.ReadStructBegin();
      while (true)
      {
         field = iprot.ReadFieldBegin();
         if (field.Type == TType.Stop)
         {
            break;
         }
         switch (field.ID)
         {
            case 1:
               if (field.Type == TType.I32)
               {
                  Version = iprot.ReadI32();
                  isset_version = true;
               }
               else
               {
                  TProtocolUtil.Skip(iprot, field.Type);
               }
               break;
            case 2:
               if (field.Type == TType.List)
               {
                  Schema = new List<SchemaElement>();
                  TList _list24 = iprot.ReadListBegin();
                  for (int _i25 = 0; _i25 < _list24.Count; ++_i25)
                  {
                     var _elem26 = new SchemaElement();
                     _elem26.Read(iprot);
                     Schema.Add(_elem26);
                  }
                  iprot.ReadListEnd();
                  isset_schema = true;
               }
               else
               {
                  TProtocolUtil.Skip(iprot, field.Type);
               }
               break;
            case 3:
               if (field.Type == TType.I64)
               {
                  Num_rows = iprot.ReadI64();
                  isset_num_rows = true;
               }
               else
               {
                  TProtocolUtil.Skip(iprot, field.Type);
               }
               break;
            case 4:
               if (field.Type == TType.List)
               {
                  Row_groups = new List<RowGroup>();
                  TList _list27 = iprot.ReadListBegin();
                  for (int _i28 = 0; _i28 < _list27.Count; ++_i28)
                  {
                     var _elem29 = new RowGroup();
                     _elem29.Read(iprot);
                     Row_groups.Add(_elem29);
                  }
                  iprot.ReadListEnd();
                  isset_row_groups = true;
               }
               else
               {
                  TProtocolUtil.Skip(iprot, field.Type);
               }
               break;
            case 5:
               if (field.Type == TType.List)
               {
                  Key_value_metadata = new List<KeyValue>();
                  TList _list30 = iprot.ReadListBegin();
                  for (int _i31 = 0; _i31 < _list30.Count; ++_i31)
                  {
                     var _elem32 = new KeyValue();
                     _elem32.Read(iprot);
                     Key_value_metadata.Add(_elem32);
                  }
                  iprot.ReadListEnd();
               }
               else
               {
                  TProtocolUtil.Skip(iprot, field.Type);
               }
               break;
            case 6:
               if (field.Type == TType.String)
               {
                  Created_by = iprot.ReadString();
               }
               else
               {
                  TProtocolUtil.Skip(iprot, field.Type);
               }
               break;
            default:
               TProtocolUtil.Skip(iprot, field.Type);
               break;
         }
         iprot.ReadFieldEnd();
      }
      iprot.ReadStructEnd();
      if (!isset_version) throw new TProtocolException(TProtocolException.INVALID_DATA);
      if (!isset_schema) throw new TProtocolException(TProtocolException.INVALID_DATA);
      if (!isset_num_rows) throw new TProtocolException(TProtocolException.INVALID_DATA);
      if (!isset_row_groups) throw new TProtocolException(TProtocolException.INVALID_DATA);
   }
   finally
   {
      iprot.DecrementRecursionDepth();
   }
}
public void Read(TProtocol iprot)
{
   iprot.IncrementRecursionDepth();
   try
   {
      bool isset_version = false;
      bool isset_schema = false;
      bool isset_num_rows = false;
      bool isset_row_groups = false;
      TField field;
      iprot.ReadStructBegin();
      while (true)
      {
         field = iprot.ReadFieldBegin();
         if (field.Type == TType.Stop)
         {
            break;
         }
         switch (field.ID)
         {
            case 1:
               if (field.Type == TType.I32)
               {
                  Version = iprot.ReadI32();
                  isset_version = true;
               }
               else
               {
                  TProtocolUtil.Skip(iprot, field.Type);
               }
               break;
            case 2:
               if (field.Type == TType.List)
               {
                  Schema = new List<SchemaElement>();
                  TList _list48 = iprot.ReadListBegin();
                  for (int _i49 = 0; _i49 < _list48.Count; ++_i49)
                  {
                     var _elem50 = new SchemaElement();
                     _elem50.Read(iprot);
                     Schema.Add(_elem50);
                  }
                  iprot.ReadListEnd();
                  isset_schema = true;
               }
               else
               {
                  TProtocolUtil.Skip(iprot, field.Type);
               }
               break;
            case 3:
               if (field.Type == TType.I64)
               {
                  Num_rows = iprot.ReadI64();
                  isset_num_rows = true;
               }
               else
               {
                  TProtocolUtil.Skip(iprot, field.Type);
               }
               break;
            case 4:
               if (field.Type == TType.List)
               {
                  Row_groups = new List<RowGroup>();
                  TList _list51 = iprot.ReadListBegin();
                  for (int _i52 = 0; _i52 < _list51.Count; ++_i52)
                  {
                     var _elem53 = new RowGroup();
                     _elem53.Read(iprot);
                     Row_groups.Add(_elem53);
                  }
                  iprot.ReadListEnd();
                  isset_row_groups = true;
               }
               else
               {
                  TProtocolUtil.Skip(iprot, field.Type);
               }
               break;
            case 5:
               if (field.Type == TType.List)
               {
                  Key_value_metadata = new List<KeyValue>();
                  TList _list54 = iprot.ReadListBegin();
                  for (int _i55 = 0; _i55 < _list54.Count; ++_i55)
                  {
                     var _elem56 = new KeyValue();
                     _elem56.Read(iprot);
                     Key_value_metadata.Add(_elem56);
                  }
                  iprot.ReadListEnd();
               }
               else
               {
                  TProtocolUtil.Skip(iprot, field.Type);
               }
               break;
            case 6:
               if (field.Type == TType.String)
               {
                  Created_by = iprot.ReadString();
               }
               else
               {
                  TProtocolUtil.Skip(iprot, field.Type);
               }
               break;
            case 7:
               if (field.Type == TType.List)
               {
                  Column_orders = new List<ColumnOrder>();
                  TList _list57 = iprot.ReadListBegin();
                  for (int _i58 = 0; _i58 < _list57.Count; ++_i58)
                  {
                     var _elem59 = new ColumnOrder();
                     _elem59.Read(iprot);
                     Column_orders.Add(_elem59);
                  }
                  iprot.ReadListEnd();
               }
               else
               {
                  TProtocolUtil.Skip(iprot, field.Type);
               }
               break;
            case 8:
               if (field.Type == TType.Struct)
               {
                  Encryption_algorithm = new EncryptionAlgorithm();
                  Encryption_algorithm.Read(iprot);
               }
               else
               {
                  TProtocolUtil.Skip(iprot, field.Type);
               }
               break;
            case 9:
               if (field.Type == TType.String)
               {
                  Footer_signing_key_metadata = iprot.ReadBinary();
               }
               else
               {
                  TProtocolUtil.Skip(iprot, field.Type);
               }
               break;
            default:
               TProtocolUtil.Skip(iprot, field.Type);
               break;
         }
         iprot.ReadFieldEnd();
      }
      iprot.ReadStructEnd();
      if (!isset_version) throw new TProtocolException(TProtocolException.INVALID_DATA);
      if (!isset_schema) throw new TProtocolException(TProtocolException.INVALID_DATA);
      if (!isset_num_rows) throw new TProtocolException(TProtocolException.INVALID_DATA);
      if (!isset_row_groups) throw new TProtocolException(TProtocolException.INVALID_DATA);
   }
   finally
   {
      iprot.DecrementRecursionDepth();
   }
}
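// Sketch: driving the generated Read(...) above with the Apache Thrift C#
// runtime. Parquet footers are compact-protocol encoded; TStreamTransport and
// TCompactProtocol come from the Thrift library (exact type names and
// signatures vary across Thrift versions, so treat this as an assumption):
using System.IO;
using Thrift.Protocol;
using Thrift.Transport;

public static class FooterReadExample
{
   public static FileMetaData ReadFooter(Stream footerBytes)
   {
      var transport = new TStreamTransport(footerBytes, null);
      var proto = new TCompactProtocol(transport);
      var md = new FileMetaData();
      md.Read(proto);
      return md;
   }
}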
public async Task ReadAsync(TProtocol iprot, CancellationToken cancellationToken)
{
   iprot.IncrementRecursionDepth();
   try
   {
      bool isset_version = false;
      bool isset_schema = false;
      bool isset_num_rows = false;
      bool isset_row_groups = false;
      TField field;
      await iprot.ReadStructBeginAsync(cancellationToken);
      while (true)
      {
         field = await iprot.ReadFieldBeginAsync(cancellationToken);
         if (field.Type == TType.Stop)
         {
            break;
         }
         switch (field.ID)
         {
            case 1:
               if (field.Type == TType.I32)
               {
                  Version = await iprot.ReadI32Async(cancellationToken);
                  isset_version = true;
               }
               else
               {
                  await TProtocolUtil.SkipAsync(iprot, field.Type, cancellationToken);
               }
               break;
            case 2:
               if (field.Type == TType.List)
               {
                  Schema = new List<SchemaElement>();
                  TList _list44 = await iprot.ReadListBeginAsync(cancellationToken);
                  for (int _i45 = 0; _i45 < _list44.Count; ++_i45)
                  {
                     var _elem46 = new SchemaElement();
                     await _elem46.ReadAsync(iprot, cancellationToken);
                     Schema.Add(_elem46);
                  }
                  await iprot.ReadListEndAsync(cancellationToken);
                  isset_schema = true;
               }
               else
               {
                  await TProtocolUtil.SkipAsync(iprot, field.Type, cancellationToken);
               }
               break;
            case 3:
               if (field.Type == TType.I64)
               {
                  Num_rows = await iprot.ReadI64Async(cancellationToken);
                  isset_num_rows = true;
               }
               else
               {
                  await TProtocolUtil.SkipAsync(iprot, field.Type, cancellationToken);
               }
               break;
            case 4:
               if (field.Type == TType.List)
               {
                  Row_groups = new List<RowGroup>();
                  TList _list47 = await iprot.ReadListBeginAsync(cancellationToken);
                  for (int _i48 = 0; _i48 < _list47.Count; ++_i48)
                  {
                     var _elem49 = new RowGroup();
                     await _elem49.ReadAsync(iprot, cancellationToken);
                     Row_groups.Add(_elem49);
                  }
                  await iprot.ReadListEndAsync(cancellationToken);
                  isset_row_groups = true;
               }
               else
               {
                  await TProtocolUtil.SkipAsync(iprot, field.Type, cancellationToken);
               }
               break;
            case 5:
               if (field.Type == TType.List)
               {
                  Key_value_metadata = new List<KeyValue>();
                  TList _list50 = await iprot.ReadListBeginAsync(cancellationToken);
                  for (int _i51 = 0; _i51 < _list50.Count; ++_i51)
                  {
                     var _elem52 = new KeyValue();
                     await _elem52.ReadAsync(iprot, cancellationToken);
                     Key_value_metadata.Add(_elem52);
                  }
                  await iprot.ReadListEndAsync(cancellationToken);
               }
               else
               {
                  await TProtocolUtil.SkipAsync(iprot, field.Type, cancellationToken);
               }
               break;
            case 6:
               if (field.Type == TType.String)
               {
                  Created_by = await iprot.ReadStringAsync(cancellationToken);
               }
               else
               {
                  await TProtocolUtil.SkipAsync(iprot, field.Type, cancellationToken);
               }
               break;
            case 7:
               if (field.Type == TType.List)
               {
                  Column_orders = new List<ColumnOrder>();
                  TList _list53 = await iprot.ReadListBeginAsync(cancellationToken);
                  for (int _i54 = 0; _i54 < _list53.Count; ++_i54)
                  {
                     var _elem55 = new ColumnOrder();
                     await _elem55.ReadAsync(iprot, cancellationToken);
                     Column_orders.Add(_elem55);
                  }
                  await iprot.ReadListEndAsync(cancellationToken);
               }
               else
               {
                  await TProtocolUtil.SkipAsync(iprot, field.Type, cancellationToken);
               }
               break;
            default:
               await TProtocolUtil.SkipAsync(iprot, field.Type, cancellationToken);
               break;
         }
         await iprot.ReadFieldEndAsync(cancellationToken);
      }
      await iprot.ReadStructEndAsync(cancellationToken);
      if (!isset_version) throw new TProtocolException(TProtocolException.INVALID_DATA);
      if (!isset_schema) throw new TProtocolException(TProtocolException.INVALID_DATA);
      if (!isset_num_rows) throw new TProtocolException(TProtocolException.INVALID_DATA);
      if (!isset_row_groups) throw new TProtocolException(TProtocolException.INVALID_DATA);
   }
   finally
   {
      iprot.DecrementRecursionDepth();
   }
}
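// The async variant is driven the same way (assuming an async-capable
// TProtocol from a netstd Thrift runtime, obtained analogously to the
// synchronous sketch above):
using System.Threading;
using System.Threading.Tasks;
using Thrift.Protocol;

public static class FooterReadAsyncExample
{
   public static async Task<FileMetaData> ReadFooterAsync(TProtocol proto, CancellationToken ct)
   {
      var md = new FileMetaData();
      await md.ReadAsync(proto, ct);
      return md;
   }
}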