/// <summary>
/// Reads one data page: fetches the raw page bytes, then decodes repetition levels,
/// definition levels and column values from them, in that order.
/// </summary>
/// <param name="dataTypeHandler">Handler forwarded to <c>ReadColumn</c> for decoding the values.</param>
/// <param name="ph">Page header; used to read the raw bytes and to obtain the data page encoding.</param>
/// <param name="tse">Schema element forwarded to <c>ReadColumn</c>.</param>
/// <param name="maxValues">Value limit forwarded to <c>ReadColumn</c>.</param>
/// <returns>
/// A <c>PageData</c> whose <c>repetitions</c>/<c>definitions</c> are only populated when the
/// corresponding maximum level is greater than zero, and whose <c>values</c>/<c>indexes</c>
/// are filled by <c>ReadColumn</c>.
/// </returns>
private PageData ReadDataPage(IDataTypeHandler dataTypeHandler, Thrift.PageHeader ph, Thrift.SchemaElement tse, long maxValues)
{
    byte[] data = ReadRawBytes(ph, _inputStream);

    _footer.GetLevels(_thriftColumnChunk, out int maxRepetitionLevel, out int maxDefinitionLevel);

    var pd = new PageData();

    using (var dataStream = new MemoryStream(data))
    using (var reader = new BinaryReader(dataStream))
    {
        // All three reads below consume the same reader sequentially, so their order must be kept.
        if (maxRepetitionLevel > 0)
        {
            pd.repetitions = ReadLevels(reader, maxRepetitionLevel);
        }

        if (maxDefinitionLevel > 0)
        {
            pd.definitions = ReadLevels(reader, maxDefinitionLevel);
        }

        ReadColumn(dataTypeHandler, tse, reader, ph.Data_page_header.Encoding, maxValues, out pd.values, out pd.indexes);
    }

    return pd;
}
/// <summary>
/// Creates a reader bound to one column chunk of the given input stream.
/// Besides storing its arguments, the constructor wraps the stream in a <c>ThriftStream</c>,
/// asks the footer for the chunk's maximum repetition/definition levels and schema element,
/// and resolves the data type handler through <c>DataTypeFactory.Match</c>.
/// </summary>
/// <param name="inputStream">Stream the column data is read from.</param>
/// <param name="thriftColumnChunk">Column chunk this reader works on.</param>
/// <param name="footer">Footer used to look up levels and the schema element of the chunk.</param>
/// <param name="parquetOptions">Options passed to <c>DataTypeFactory.Match</c>.</param>
/// <exception cref="ArgumentNullException">Thrown when any of the arguments is null.</exception>
public ColumnarReader(Stream inputStream, Thrift.ColumnChunk thriftColumnChunk, ThriftFooter footer, ParquetOptions parquetOptions)
{
    // Validate everything up front, in parameter order.
    if (inputStream is null)
    {
        throw new ArgumentNullException(nameof(inputStream));
    }

    if (thriftColumnChunk is null)
    {
        throw new ArgumentNullException(nameof(thriftColumnChunk));
    }

    if (footer is null)
    {
        throw new ArgumentNullException(nameof(footer));
    }

    if (parquetOptions is null)
    {
        throw new ArgumentNullException(nameof(parquetOptions));
    }

    _inputStream = inputStream;
    _thriftColumnChunk = thriftColumnChunk;
    _footer = footer;
    _parquetOptions = parquetOptions;

    _thriftStream = new ThriftStream(inputStream);

    _footer.GetLevels(_thriftColumnChunk, out int maxRepetition, out int maxDefinition);
    _maxRepetitionLevel = maxRepetition;
    _maxDefinitionLevel = maxDefinition;

    _thriftSchemaElement = _footer.GetSchemaElement(_thriftColumnChunk);
    _dataTypeHandler = DataTypeFactory.Match(_thriftSchemaElement, _parquetOptions);
}
/// <summary>
/// Writes a column as a new column chunk and fills in the chunk's metadata counters.
/// </summary>
/// <param name="tse">Schema element of the column; its <c>Type</c> is used to create the chunk.</param>
/// <param name="path">Path passed to the footer when creating the chunk.</param>
/// <param name="column">Column data handed to <c>WriteColumn</c>.</param>
/// <param name="dataTypeHandler">Handler handed to <c>WriteColumn</c>.</param>
/// <returns>The created column chunk with value count and size totals set.</returns>
private Thrift.ColumnChunk WriteColumnChunk(Thrift.SchemaElement tse, List<string> path, DataColumn column, IDataTypeHandler dataTypeHandler)
{
    Thrift.ColumnChunk columnChunk = _footer.CreateColumnChunk(_compressionMethod, _stream, tse.Type, path, 0);
    Thrift.PageHeader dataPageHeader = _footer.CreateDataPage(_rowCount);

    _footer.GetLevels(columnChunk, out int maxRepetitionLevel, out int maxDefinitionLevel);

    List<PageTag> writtenPages = WriteColumn(column, tse, dataTypeHandler, maxRepetitionLevel, maxDefinitionLevel);

    columnChunk.Meta_data.Num_values = dataPageHeader.Data_page_header.Num_values;

    // Size totals cover every written page and count the page header in addition to the page data.
    var compressedTotal = writtenPages.Sum(page => page.HeaderMeta.Compressed_page_size + page.HeaderSize);
    columnChunk.Meta_data.Total_compressed_size = compressedTotal;

    var uncompressedTotal = writtenPages.Sum(page => page.HeaderMeta.Uncompressed_page_size + page.HeaderSize);
    columnChunk.Meta_data.Total_uncompressed_size = uncompressedTotal;

    return columnChunk;
}
/// <summary>
/// Writes the given column as a new column chunk and fills in the chunk's metadata counters.
/// </summary>
/// <param name="path">Path passed to the footer when creating the chunk.</param>
/// <param name="column">Column to write; its <c>TotalCount</c> is used to create the data page header.</param>
/// <param name="dataTypeHandler">Handler handed to <c>WriteColumn</c>.</param>
/// <returns>The created column chunk with value count and size totals set.</returns>
public Thrift.ColumnChunk Write(List<string> path, DataColumn column, IDataTypeHandler dataTypeHandler)
{
    Thrift.ColumnChunk columnChunk = _footer.CreateColumnChunk(_compressionMethod, _stream, _schemaElement.Type, path, 0);
    Thrift.PageHeader dataPageHeader = _footer.CreateDataPage(column.TotalCount);

    _footer.GetLevels(columnChunk, out int maxRepetitionLevel, out int maxDefinitionLevel);

    List<PageTag> writtenPages = WriteColumn(column, _schemaElement, dataTypeHandler, maxRepetitionLevel, maxDefinitionLevel);

    // The value count has to cover every value in the column, nulls included.
    // For hierarchical/repeated columns that means the count of the flattened list, nulls included.
    columnChunk.Meta_data.Num_values = dataPageHeader.Data_page_header.Num_values;

    // Size totals cover every written page and count the page header in addition to the page data.
    var compressedTotal = writtenPages.Sum(page => page.HeaderMeta.Compressed_page_size + page.HeaderSize);
    columnChunk.Meta_data.Total_compressed_size = compressedTotal;

    var uncompressedTotal = writtenPages.Sum(page => page.HeaderMeta.Uncompressed_page_size + page.HeaderSize);
    columnChunk.Meta_data.Total_uncompressed_size = uncompressedTotal;

    return columnChunk;
}
/// <summary>
/// Creates a writer for a single column. Besides storing its arguments, the constructor
/// resolves the data type handler through <c>DataTypeFactory.Match</c>, asks the footer to
/// create the column chunk and a data page header (created with a count of 0), and caches
/// the chunk's maximum repetition and definition levels.
/// </summary>
/// <param name="output">Output stream; also passed to the footer when creating the column chunk.</param>
/// <param name="thriftStream">Thrift stream stored for later use by the writer.</param>
/// <param name="footer">Footer used to create the chunk and page header and to look up levels.</param>
/// <param name="tse">Schema element of the column; its <c>Type</c> is used to create the chunk.</param>
/// <param name="path">Path passed to the footer when creating the column chunk.</param>
/// <param name="compressionMethod">Compression method passed to the footer when creating the chunk.</param>
/// <param name="formatOptions">Options passed to <c>DataTypeFactory.Match</c>.</param>
/// <param name="writerOptions">Writer options stored for later use by the writer.</param>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="footer"/> or <paramref name="tse"/> is null.
/// </exception>
public ColumnarWriter(Stream output, ThriftStream thriftStream, ThriftFooter footer, Thrift.SchemaElement tse, List<string> path, CompressionMethod compressionMethod, ParquetOptions formatOptions, WriterOptions writerOptions)
{
    // Both of these are dereferenced unconditionally below, so a null can never succeed;
    // fail with a descriptive exception instead of a NullReferenceException.
    // The remaining arguments are only stored or forwarded here and are left unvalidated.
    if (footer is null)
    {
        throw new ArgumentNullException(nameof(footer));
    }

    if (tse is null)
    {
        throw new ArgumentNullException(nameof(tse));
    }

    _output = output;
    _thriftStream = thriftStream;
    _footer = footer;
    _tse = tse;
    _compressionMethod = compressionMethod;
    _formatOptions = formatOptions;
    _writerOptions = writerOptions;

    _dataTypeHandler = DataTypeFactory.Match(tse, _formatOptions);

    _chunk = _footer.CreateColumnChunk(_compressionMethod, _output, _tse.Type, path, 0);
    _ph = _footer.CreateDataPage(0);

    _footer.GetLevels(_chunk, out int maxRepetitionLevel, out int maxDefinitionLevel);
    _maxRepetitionLevel = maxRepetitionLevel;
    _maxDefinitionLevel = maxDefinitionLevel;
}