/// <summary>
/// Writes <paramref name="column"/> as one data page. The page body is staged in memory so that both
/// page sizes can be stored in the header before the header is written, then the body is copied out.
/// </summary>
/// <param name="column">Column supplying the levels and the defined data.</param>
/// <param name="tse">Schema element handed to the data type handler.</param>
/// <param name="dataTypeHandler">Handler that writes the defined values.</param>
/// <param name="maxRepetitionLevel">Repetition levels are written only when this is greater than zero.</param>
/// <param name="maxDefinitionLevel">Definition levels are written only when this is greater than zero.</param>
/// <returns>A list with one <see cref="PageTag"/> carrying the page header and the header size.</returns>
private List<PageTag> WriteColumn(DataColumn column, Thrift.SchemaElement tse, IDataTypeHandler dataTypeHandler, int maxRepetitionLevel, int maxDefinitionLevel)
{
    var result = new List<PageTag>();

    // The page header must precede the page body (compressed or not), yet it carries both the
    // uncompressed and the compressed size, which are unknown until the body has been produced.
    // That is why the body is buffered first, at some cost to write efficiency.
    using (var staging = new MemoryStream())
    {
        Thrift.PageHeader header = _footer.CreateDataPage(column.TotalCount);

        // Chain the streams so data is streamed through instead of wasting large-object-heap memory.
        using (GapStream body = DataStreamFactory.CreateWriter(staging, _compressionMethod, true))
        {
            using (var bw = new BinaryWriter(body, Encoding.UTF8, true))
            {
                bool hasRepetitions = maxRepetitionLevel > 0;
                bool hasDefinitions = maxDefinitionLevel > 0;

                if (hasRepetitions)
                {
                    WriteLevels(bw, column.RepetitionLevels, maxRepetitionLevel);
                }

                if (hasDefinitions)
                {
                    WriteLevels(bw, column.DefinitionLevels, maxDefinitionLevel);
                }

                dataTypeHandler.Write(tse, bw, column.DefinedData);

                bw.Flush();
            }

            // Flushing matters: some compression algorithms do not finish writing otherwise.
            body.Flush();
            header.Uncompressed_page_size = (int)body.Position;
        }

        header.Compressed_page_size = (int)staging.Position;

        // Header first, then the staged body.
        int headerSize = _thriftStream.Write(header);
        staging.Position = 0;
        staging.CopyTo(_stream);

        result.Add(new PageTag
        {
            HeaderMeta = header,
            HeaderSize = headerSize
        });
    }

    return result;
}
/// <summary>
/// Reads one data page: optional repetition levels, optional definition levels, then the column values.
/// </summary>
/// <param name="ph">Header of the page to read; its data page header supplies value count and encoding.</param>
/// <param name="maxValues">Value limit forwarded to <c>ReadColumn</c>.</param>
/// <returns>A <see cref="PageData"/> populated with whatever the page contained.</returns>
private PageData ReadDataPage(Thrift.PageHeader ph, long maxValues)
{
    int levelCount = ph.Data_page_header.Num_values;
    var page = new PageData();

    using (Stream source = OpenDataPageStream(ph))
    using (var br = new BinaryReader(source))
    {
        // Levels are stored only for columns whose maximum level is non-zero.
        if (_maxRepetitionLevel > 0)
        {
            page.repetitions = ReadLevels(br, _maxRepetitionLevel, levelCount);
        }

        if (_maxDefinitionLevel > 0)
        {
            page.definitions = ReadLevels(br, _maxDefinitionLevel, levelCount);
        }

        ReadColumn(br, ph.Data_page_header.Encoding, maxValues, out page.values, out page.indexes);
    }

    return page;
}
/// <summary>
/// Compresses page data with the requested method and records both sizes on the page header.
/// </summary>
/// <param name="ph">Header whose uncompressed and compressed page sizes are set.</param>
/// <param name="data">Raw page bytes.</param>
/// <param name="compression">Compression method; <c>None</c> returns <paramref name="data"/> unchanged.</param>
/// <returns>The bytes to write for the page body.</returns>
private byte[] Compress(Thrift.PageHeader ph, byte[] data, CompressionMethod compression)
{
    // By spec the page size numbers exclude the header itself.
    ph.Uncompressed_page_size = data.Length;

    if (compression == CompressionMethod.None)
    {
        ph.Compressed_page_size = ph.Uncompressed_page_size;
        return data;
    }

    IDataWriter compressor = DataFactory.GetWriter(compression);
    byte[] packed;
    using (var sink = new MemoryStream())
    {
        compressor.Write(data, sink);
        packed = sink.ToArray();
    }

    ph.Compressed_page_size = packed.Length;
    return packed;
}
/// <summary>
/// Reads one data page from the input stream: optional level lists followed by the column values,
/// which are read into <paramref name="destination"/>.
/// </summary>
/// <param name="ph">Header of the page to read.</param>
/// <param name="destination">List passed to <c>ReadColumnValues</c> to receive the values.</param>
/// <param name="maxValues">Not used by this method.</param>
/// <returns>Page data holding the definitions, repetitions and indexes that were read.</returns>
private PageData ReadDataPage(Thrift.PageHeader ph, IList destination, long maxValues)
{
    byte[] raw = ReadRawBytes(ph, _inputStream);
    int valueCount = ph.Data_page_header.Num_values;

    using (var buffer = new MemoryStream(raw))
    using (var br = new BinaryReader(buffer))
    {
        List<int> repetitions = null;
        if (_schema.HasRepetitionLevelsPage)
        {
            repetitions = ReadRepetitionLevels(br);
        }

        List<int> definitions = null;
        if (_schema.HasDefinitionLevelsPage)
        {
            definitions = ReadDefinitionLevels(br);
        }

        // Indexes are pointers back into the values table (a lookup on values).
        List<int> indexes = ReadColumnValues(br, ph.Data_page_header.Encoding, destination, valueCount);

        return new PageData
        {
            definitions = definitions,
            repetitions = repetitions,
            indexes = indexes
        };
    }
}
/// <summary>
/// Reads the column chunk: an optional dictionary page followed by data pages, merges the results
/// through <c>ValueMerger</c> and trims the merged list to the requested window.
/// </summary>
/// <param name="offset">Start passed to <c>Trim</c> (cast to <c>int</c>).</param>
/// <param name="count">Count passed to <c>Trim</c> (cast to <c>int</c>).</param>
/// <returns>The merged and trimmed list of values.</returns>
public IList Read(long offset, long count)
{
    long fileOffset = GetFileOffset();
    long maxValues = _thriftColumnChunk.Meta_data.Num_values;

    _inputStream.Seek(fileOffset, SeekOrigin.Begin);

    IList dictionary = null;
    List<int> indexes = null;
    List<int> repetitions = null;
    List<int> definitions = null;
    IList values = null;

    // A column can contain at most one dictionary page, and it comes first.
    Thrift.PageHeader ph = _thriftStream.Read<Thrift.PageHeader>();
    if (TryReadDictionaryPage(ph, out dictionary))
    {
        ph = _thriftStream.Read<Thrift.PageHeader>();
    }

    int pagesRead = 0;

    while (true)
    {
        int indexCount = indexes?.Count ?? 0;
        int valueCount = values?.Count ?? 0;
        int valuesSoFar = Math.Max(indexCount, valueCount);

        PageData page = ReadDataPage(ph, maxValues - valuesSoFar);

        repetitions = AssignOrAdd(repetitions, page.repetitions);
        definitions = AssignOrAdd(definitions, page.definitions);
        indexes = AssignOrAdd(indexes, page.indexes);
        values = AssignOrAdd(values, page.values);

        pagesRead++;

        int dataCount = (values?.Count ?? 0) + (indexes?.Count ?? 0);
        int definitionCount = definitions?.Count ?? 0;
        int totalCount = Math.Max(dataCount, definitionCount);

        if (totalCount >= maxValues)
        {
            break; // limit reached
        }

        ph = _thriftStream.Read<Thrift.PageHeader>();
        if (ph.Type != Thrift.PageType.DATA_PAGE)
        {
            break;
        }
    }

    // When no plain values were read, start from an empty list produced by the type handler.
    IList seed = values ?? _dataTypeHandler.CreateEmptyList(_thriftSchemaElement.IsNullable(), false, 0);

    var merger = new ValueMerger(
        _maxDefinitionLevel,
        _maxRepetitionLevel,
        () => _dataTypeHandler.CreateEmptyList(_thriftSchemaElement.IsNullable(), false, 0),
        seed);

    IList mergedValues = merger.Apply(dictionary, definitions, repetitions, indexes, (int)maxValues);

    mergedValues.Trim((int)offset, (int)count);

    return mergedValues;
}
/// <summary>
/// Writes a page header through the thrift stream, then the page bytes to the output stream.
/// </summary>
/// <param name="ph">Page header to write.</param>
/// <param name="data">Page body; written in full.</param>
/// <returns>The value returned by the thrift stream for the header write (the header size).</returns>
private int Write(Thrift.PageHeader ph, byte[] data)
{
    int headerBytes = _thriftStream.Write(ph);

    _output.Write(data, 0, data.Length);

    return headerBytes;
}
/// <summary>
/// Reads a dictionary page if <paramref name="ph"/> describes one.
/// </summary>
/// <param name="ph">Page header to inspect.</param>
/// <param name="dictionary">Receives the dictionary array, or <c>null</c> when the page is not a dictionary page.</param>
/// <param name="dictionaryOffset">Receives the value returned by the type handler's read, or 0 when not a dictionary page.</param>
/// <returns><c>true</c> when a dictionary page was read; otherwise <c>false</c>.</returns>
private bool TryReadDictionaryPage(Thrift.PageHeader ph, out Array dictionary, out int dictionaryOffset)
{
    bool isDictionaryPage = ph.Type == Thrift.PageType.DICTIONARY_PAGE;
    if (!isDictionaryPage)
    {
        dictionary = null;
        dictionaryOffset = 0;
        return false;
    }

    // Dictionary page format: the dictionary entries, in dictionary order, using plain encoding.
    using (BytesOwner owner = ReadPageData(ph))
    // todo: copying into a MemoryStream is ugly; to be removed once other parts move to System.Memory
    using (var buffer = new MemoryStream(owner.Memory.ToArray()))
    using (var br = new BinaryReader(buffer))
    {
        dictionary = _dataTypeHandler.GetArray(ph.Dictionary_page_header.Num_values, false, false);
        dictionaryOffset = _dataTypeHandler.Read(br, _thriftSchemaElement, dictionary, 0);
    }

    return true;
}
/// <summary>
/// Reads one data page from raw page bytes: optional repetition and definition levels, then the values.
/// </summary>
/// <param name="ph">Header of the page to read.</param>
/// <param name="maxValues">Value limit forwarded to <c>ReadColumn</c>.</param>
/// <returns>The page contents.</returns>
PageData ReadDataPage(Thrift.PageHeader ph, long maxValues)
{
    byte[] raw = ReadRawBytes(ph, _inputStream);
    int levelCount = ph.Data_page_header.Num_values;
    var page = new PageData();

    using (var buffer = new MemoryStream(raw))
    using (var br = new BinaryReader(buffer))
    {
        // Levels are present only when the corresponding maximum level is non-zero.
        bool readRepetitions = _maxRepetitionLevel > 0;
        if (readRepetitions)
        {
            page.repetitions = ReadLevels(br, _maxRepetitionLevel, levelCount);
        }

        bool readDefinitions = _maxDefinitionLevel > 0;
        if (readDefinitions)
        {
            page.definitions = ReadLevels(br, _maxDefinitionLevel, levelCount);
        }

        ReadColumn(br, ph.Data_page_header.Encoding, maxValues, out page.values, out page.indexes);
    }

    return page;
}
/// <summary>
/// Reads one data page, taking the maximum repetition and definition levels from the footer.
/// </summary>
/// <param name="dataTypeHandler">Handler forwarded to <c>ReadColumn</c>.</param>
/// <param name="ph">Header of the page to read.</param>
/// <param name="tse">Schema element forwarded to <c>ReadColumn</c>.</param>
/// <param name="maxValues">Value limit forwarded to <c>ReadColumn</c>.</param>
/// <returns>The page contents.</returns>
private PageData ReadDataPage(IDataTypeHandler dataTypeHandler, Thrift.PageHeader ph, Thrift.SchemaElement tse, long maxValues)
{
    byte[] raw = ReadRawBytes(ph, _inputStream);

    // Read from the header but not consumed further down in this method.
    int max = ph.Data_page_header.Num_values;

    _footer.GetLevels(_thriftColumnChunk, out int maxRepetitionLevel, out int maxDefinitionLevel);

    var page = new PageData();

    using (var buffer = new MemoryStream(raw))
    using (var br = new BinaryReader(buffer))
    {
        if (maxRepetitionLevel > 0)
        {
            page.repetitions = ReadLevels(br, maxRepetitionLevel);
        }

        if (maxDefinitionLevel > 0)
        {
            page.definitions = ReadLevels(br, maxDefinitionLevel);
        }

        ReadColumn(dataTypeHandler, tse, br, ph.Data_page_header.Encoding, maxValues, out page.values, out page.indexes);
    }

    return page;
}
/// <summary>
/// Reads one data page into the column accumulator <paramref name="cd"/>: levels are appended at the
/// accumulator's current offsets and the values are read by <c>ReadColumn</c>.
/// </summary>
/// <param name="ph">Header of the page to read.</param>
/// <param name="cd">Accumulator; level arrays are created with <c>maxCount</c> elements on first use.</param>
/// <param name="maxValues">Value limit forwarded to <c>ReadColumn</c>.</param>
private void ReadDataPage(Thrift.PageHeader ph, ColumnRawData cd, long maxValues)
{
    using (Stream page = OpenDataPageStream(ph))
    {
        ParquetEventSource.Current.OpenDataPage(
            _dataField.Path,
            _thriftColumnChunk.Meta_data.Codec.ToString(),
            page.Length);

        using (var br = new BinaryReader(page))
        {
            bool hasRepetitions = _maxRepetitionLevel > 0;
            if (hasRepetitions)
            {
                // todo: use rented buffers, but a rented array can be longer than requested, so any
                // logic relying on the array length would have to be fixed as well.
                if (cd.repetitions == null)
                {
                    cd.repetitions = new int[cd.maxCount];
                }

                cd.repetitionsOffset += ReadLevels(br, _maxRepetitionLevel, cd.repetitions, cd.repetitionsOffset);
            }

            bool hasDefinitions = _maxDefinitionLevel > 0;
            if (hasDefinitions)
            {
                if (cd.definitions == null)
                {
                    cd.definitions = new int[cd.maxCount];
                }

                cd.definitionsOffset += ReadLevels(br, _maxDefinitionLevel, cd.definitions, cd.definitionsOffset);
            }

            ReadColumn(
                br,
                ph.Data_page_header.Encoding,
                maxValues,
                ref cd.values, ref cd.valuesOffset,
                ref cd.indexes, ref cd.indexesOffset);
        }
    }
}
/// <summary>
/// Reads a page body from <paramref name="inputStream"/> using the reader registered for the chunk's codec.
/// </summary>
/// <param name="ph">Page header supplying the compressed page size.</param>
/// <param name="inputStream">Stream to read from.</param>
/// <returns>The bytes returned by the codec's reader.</returns>
private byte[] ReadRawBytes(Thrift.PageHeader ph, Stream inputStream)
{
    IDataReader codecReader = DataFactory.GetReader(_thriftChunk.Meta_data.Codec);

    return codecReader.Read(inputStream, ph.Compressed_page_size);
}
/// <summary>
/// Opens a stream over the page body. The input stream is wrapped in a <c>WindowedStream</c> built with
/// the compressed page size, and that is wrapped in a reader for the column chunk's codec.
/// </summary>
/// <param name="pageHeader">Header supplying the compressed and uncompressed page sizes.</param>
/// <returns>The stream created by <c>DataStreamFactory.CreateReader</c>.</returns>
private Stream OpenDataPageStream(Thrift.PageHeader pageHeader)
{
    var pageWindow = new WindowedStream(_inputStream, pageHeader.Compressed_page_size);

    return DataStreamFactory.CreateReader(
        pageWindow,
        _thriftColumnChunk.Meta_data.Codec,
        pageHeader.Uncompressed_page_size);
}
/// <summary>
/// Writes <paramref name="column"/> as one data page. The page body is staged in memory so that both
/// page sizes can be stored in the header before the header is written, then the body is copied out.
/// </summary>
/// <param name="column">Column supplying the definition levels and the defined data.</param>
/// <param name="tse">Schema element handed to the data type handler.</param>
/// <param name="dataTypeHandler">Handler that writes the defined values.</param>
/// <param name="maxRepetitionLevel">Not used; columns with repetitions are not supported by this method.</param>
/// <param name="maxDefinitionLevel">Maximum definition level used when writing definition levels.</param>
/// <returns>A list with one <see cref="PageTag"/> carrying the page header and the header size.</returns>
/// <exception cref="NotImplementedException">The column has repetitions.</exception>
private List<PageTag> WriteColumn(DataColumn column, Thrift.SchemaElement tse, IDataTypeHandler dataTypeHandler, int maxRepetitionLevel, int maxDefinitionLevel)
{
    var pages = new List<PageTag>();

    // The page header must precede the page body (compressed or not), yet it carries both the
    // uncompressed and the compressed size, which are unknown until the body has been produced.
    // That is why the body is buffered first, at some cost to write efficiency.
    using (var ms = new MemoryStream())
    {
        Thrift.PageHeader dataPageHeader = _footer.CreateDataPage(column.TotalCount);

        // Chain the streams so data is streamed through instead of wasting large-object-heap memory.
        using (PositionTrackingStream pps = DataStreamFactory.CreateWriter(ms, _compressionMethod))
        {
            // leaveOpen: true - a BinaryWriter otherwise closes its stream on dispose, and the position
            // of pps is read right after the writer goes away. The encoding is the same one the
            // single-argument BinaryWriter constructor uses (UTF-8, no BOM, throw on invalid input).
            using (var writer = new BinaryWriter(pps, new System.Text.UTF8Encoding(false, true), true))
            {
                if (column.HasRepetitions)
                {
                    throw new NotImplementedException();
                }

                if (column.HasDefinitions)
                {
                    WriteLevels(writer, column.DefinitionLevels, maxDefinitionLevel);
                }

                dataTypeHandler.Write(tse, writer, column.DefinedData);

                writer.Flush();
            }

            // pps is still open here; it is finalised by the enclosing using.
            dataPageHeader.Uncompressed_page_size = (int)pps.Position;
        }

        dataPageHeader.Compressed_page_size = (int)ms.Position;

        // Write the header, then the staged body.
        int headerSize = _thriftStream.Write(dataPageHeader);
        ms.Position = 0;
        ms.CopyTo(_stream);

        var dataTag = new PageTag
        {
            HeaderMeta = dataPageHeader,
            HeaderSize = headerSize
        };

        pages.Add(dataTag);
    }

    return pages;
}
/// <summary>
/// Reads the pages of the column chunk (an optional dictionary page followed by data pages) and
/// accumulates their contents in local lists. Nothing is returned; merging the accumulated data
/// through <c>ValueMerger</c> is not wired in.
/// </summary>
/// <param name="offset">Not used by this method.</param>
/// <param name="count">Not used by this method.</param>
public void Read(long offset, long count)
{
    Thrift.SchemaElement tse = _footer.GetSchemaElement(_thriftColumnChunk);
    IDataTypeHandler dataTypeHandler = DataTypeFactory.Match(tse, _parquetOptions);

    long fileOffset = GetFileOffset();
    long maxValues = _thriftColumnChunk.Meta_data.Num_values;

    _inputStream.Seek(fileOffset, SeekOrigin.Begin);

    IList dictionary = null;
    List<int> indexes = null;
    List<int> repetitions = null;
    List<int> definitions = null;
    IList values = null;

    // A column can contain at most one dictionary page, and it comes first.
    Thrift.PageHeader ph = _thriftStream.Read<Thrift.PageHeader>();
    if (TryReadDictionaryPage(ph, dataTypeHandler, out dictionary))
    {
        ph = _thriftStream.Read<Thrift.PageHeader>();
    }

    int pagesRead = 0;

    while (true)
    {
        int indexCount = indexes?.Count ?? 0;
        int valueCount = values?.Count ?? 0;
        int valuesSoFar = Math.Max(indexCount, valueCount);

        PageData page = ReadDataPage(dataTypeHandler, ph, tse, maxValues - valuesSoFar);

        repetitions = AssignOrAdd(repetitions, page.repetitions);
        definitions = AssignOrAdd(definitions, page.definitions);
        indexes = AssignOrAdd(indexes, page.indexes);
        values = AssignOrAdd(values, page.values);

        pagesRead++;

        int dataCount = (values?.Count ?? 0) + (indexes?.Count ?? 0);
        int definitionCount = definitions?.Count ?? 0;
        int totalCount = Math.Max(dataCount, definitionCount);

        if (totalCount >= maxValues)
        {
            break; // limit reached
        }

        ph = _thriftStream.Read<Thrift.PageHeader>();
        if (ph.Type != Thrift.PageType.DATA_PAGE)
        {
            break;
        }
    }

    // Pending: merge via ValueMerger(_schema, values).Apply(dictionary, definitions, repetitions, indexes, (int)maxValues)
}
/// <summary>
/// Reports whether <paramref name="ph"/> is a dictionary page. Reading a dictionary page is not implemented.
/// </summary>
/// <param name="ph">Page header to inspect.</param>
/// <param name="dataTypeHandler">Not used by this method.</param>
/// <param name="dictionary">Set to <c>null</c> when the page is not a dictionary page.</param>
/// <returns><c>false</c> when the page is not a dictionary page.</returns>
/// <exception cref="NotImplementedException">The page is a dictionary page.</exception>
private bool TryReadDictionaryPage(Thrift.PageHeader ph, IDataTypeHandler dataTypeHandler, out IList dictionary)
{
    if (ph.Type == Thrift.PageType.DICTIONARY_PAGE)
    {
        throw new NotImplementedException();
    }

    dictionary = null;
    return false;
}
/// <summary>
/// Reads the column chunk (an optional dictionary page followed by data pages) and wraps the
/// accumulated values, definitions and repetitions in a <see cref="DataColumn"/>.
/// </summary>
/// <returns>A data column built from the values and levels that were read.</returns>
public DataColumn Read()
{
    long fileOffset = GetFileOffset();
    long maxValues = _thriftColumnChunk.Meta_data.Num_values;

    _inputStream.Seek(fileOffset, SeekOrigin.Begin);

    IList dictionary = null;
    List<int> indexes = null;
    List<int> repetitions = null;
    List<int> definitions = null;
    IList values = null;

    // A column can contain at most one dictionary page, and it comes first.
    Thrift.PageHeader ph = _thriftStream.Read<Thrift.PageHeader>();
    if (TryReadDictionaryPage(ph, out dictionary))
    {
        ph = _thriftStream.Read<Thrift.PageHeader>();
    }

    int pagesRead = 0;

    while (true)
    {
        int indexCount = indexes?.Count ?? 0;
        int valueCount = values?.Count ?? 0;
        int valuesSoFar = Math.Max(indexCount, valueCount);

        PageData page = ReadDataPage(ph, maxValues - valuesSoFar);

        repetitions = AssignOrAdd(repetitions, page.repetitions);
        definitions = AssignOrAdd(definitions, page.definitions);
        indexes = AssignOrAdd(indexes, page.indexes);
        values = AssignOrAdd(values, page.values);

        pagesRead++;

        int dataCount = (values?.Count ?? 0) + (indexes?.Count ?? 0);
        int definitionCount = definitions?.Count ?? 0;
        int totalCount = Math.Max(dataCount, definitionCount);

        if (totalCount >= maxValues)
        {
            break; // limit reached
        }

        ph = _thriftStream.Read<Thrift.PageHeader>();
        if (ph.Type != Thrift.PageType.DATA_PAGE)
        {
            break;
        }
    }

    // All the data is available at this point.
    // todo: this is a simple hack for trivial tests to succeed (dictionary and indexes are not passed on)
    return new DataColumn(_dataField, values, definitions, repetitions);
}
/// <summary>
/// Reads the column chunk (an optional dictionary page followed by data pages) into a
/// <c>ColumnRawData</c> accumulator and builds a <see cref="DataColumn"/> from it.
/// </summary>
/// <returns>
/// A data column built from the accumulated values, definition and repetition levels (with their
/// maximum levels), dictionary and indexes.
/// </returns>
public DataColumn Read()
{
    long fileOffset = GetFileOffset();
    long maxValues = _thriftColumnChunk.Meta_data.Num_values;

    _inputStream.Seek(fileOffset, SeekOrigin.Begin);

    ParquetEventSource.Current.SeekColumn(_dataField.Path, fileOffset);

    var colData = new ColumnRawData();
    colData.maxCount = (int)_thriftColumnChunk.Meta_data.Num_values;

    // A column can contain at most one dictionary page, and it comes first.
    Thrift.PageHeader ph = _thriftStream.Read<Thrift.PageHeader>();
    if (TryReadDictionaryPage(ph, out colData.dictionary, out colData.dictionaryOffset))
    {
        ph = _thriftStream.Read<Thrift.PageHeader>();
    }

    while (true)
    {
        int valuesSoFar = Math.Max(
            colData.indexes == null ? 0 : colData.indexesOffset,
            colData.values == null ? 0 : colData.values.Length);

        ReadDataPage(ph, colData, maxValues - valuesSoFar);

        // Progress for definitions is measured by definitionsOffset, not by the array length:
        // ReadDataPage allocates the definitions array with maxCount elements up front, so its
        // Length already equals maxValues after the first page and would end the loop early for
        // any multi-page column that has definition levels.
        // NOTE(review): values.Length is kept as-is because the allocation of colData.values is not
        // visible here - confirm whether valuesOffset should be used instead.
        int totalCount = Math.Max(
            (colData.values == null ? 0 : colData.values.Length) +
            (colData.indexes == null ? 0 : colData.indexesOffset),
            (colData.definitions == null ? 0 : colData.definitionsOffset));

        if (totalCount >= maxValues)
        {
            break; // limit reached
        }

        ph = _thriftStream.Read<Thrift.PageHeader>();
        if (ph.Type != Thrift.PageType.DATA_PAGE)
        {
            break;
        }
    }

    // All the data is available at this point.
    return new DataColumn(
        _dataField,
        colData.values,
        colData.definitions, _maxDefinitionLevel,
        colData.repetitions, _maxRepetitionLevel,
        colData.dictionary,
        colData.indexes);
}
/// <summary>
/// Creates a dictionary page header with zeroed page sizes, plain encoding and an unsorted dictionary.
/// </summary>
/// <param name="valueCount">Number of values stored in the dictionary page header.</param>
/// <returns>The new page header.</returns>
public Thrift.PageHeader CreateDictionaryPage(int valueCount)
{
    return new Thrift.PageHeader(Thrift.PageType.DICTIONARY_PAGE, 0, 0)
    {
        Dictionary_page_header = new Thrift.DictionaryPageHeader
        {
            Encoding = Thrift.Encoding.PLAIN,
            Is_sorted = false,
            Num_values = valueCount
        }
    };
}
/// <summary>
/// Creates a data page header with zeroed page sizes: plain value encoding, RLE definition levels
/// and bit-packed repetition levels.
/// </summary>
/// <param name="valueCount">Number of values stored in the data page header.</param>
/// <returns>The new page header.</returns>
public Thrift.PageHeader CreateDataPage(int valueCount)
{
    return new Thrift.PageHeader(Thrift.PageType.DATA_PAGE, 0, 0)
    {
        Data_page_header = new Thrift.DataPageHeader
        {
            Encoding = Thrift.Encoding.PLAIN,
            Definition_level_encoding = Thrift.Encoding.RLE,
            Repetition_level_encoding = Thrift.Encoding.BIT_PACKED,
            Num_values = valueCount
        }
    };
}
/// <summary>
/// Writes one column as a column chunk and fills in the chunk's value count and size totals.
/// </summary>
/// <param name="tse">Schema element of the column.</param>
/// <param name="path">Column path passed to the footer when creating the chunk.</param>
/// <param name="column">Column data to write.</param>
/// <param name="dataTypeHandler">Handler that writes the column's values.</param>
/// <returns>The column chunk with its metadata populated.</returns>
private Thrift.ColumnChunk WriteColumnChunk(Thrift.SchemaElement tse, List<string> path, DataColumn column, IDataTypeHandler dataTypeHandler)
{
    Thrift.ColumnChunk columnChunk = _footer.CreateColumnChunk(_compressionMethod, _stream, tse.Type, path, 0);
    Thrift.PageHeader pageHeader = _footer.CreateDataPage(_rowCount);

    _footer.GetLevels(columnChunk, out int maxRepetitionLevel, out int maxDefinitionLevel);

    List<PageTag> writtenPages = WriteColumn(column, tse, dataTypeHandler, maxRepetitionLevel, maxDefinitionLevel);

    Thrift.ColumnMetaData meta = columnChunk.Meta_data;
    meta.Num_values = pageHeader.Data_page_header.Num_values;

    // Both totals must cover the page bodies and their headers.
    meta.Total_compressed_size = writtenPages.Sum(tag => tag.HeaderMeta.Compressed_page_size + tag.HeaderSize);
    meta.Total_uncompressed_size = writtenPages.Sum(tag => tag.HeaderMeta.Uncompressed_page_size + tag.HeaderSize);

    return columnChunk;
}
/// <summary>
/// Reads a dictionary page into a new list created by <c>TypeFactory</c> for the current schema.
/// </summary>
/// <param name="ph">Header of the dictionary page.</param>
/// <returns>The list filled by the plain reader.</returns>
private IList ReadDictionaryPage(Thrift.PageHeader ph)
{
    // Dictionary page format: the dictionary entries, in dictionary order, using plain encoding.
    byte[] raw = ReadRawBytes(ph, _inputStream);

    using (var buffer = new MemoryStream(raw))
    using (var br = new BinaryReader(buffer))
    {
        IList entries = TypeFactory.Create(_schema, _options);

        _plainReader.Read(br, _schema, entries, int.MaxValue);

        return entries;
    }
}
/// <summary>
/// Reads one data page into the column accumulator <paramref name="cd"/>: levels are appended at the
/// accumulator's current offsets, then the values are read by <c>ReadColumn</c>.
/// </summary>
/// <param name="ph">Header of the page to read; must carry a data page header.</param>
/// <param name="cd">Accumulator; level arrays are created with <c>maxCount</c> elements on first use.</param>
/// <param name="maxValues">Value limit forwarded to <c>ReadColumn</c>.</param>
/// <exception cref="ParquetException">The page header has no data page header.</exception>
private void ReadDataPage(Thrift.PageHeader ph, ColumnRawData cd, long maxValues)
{
    // Validate before the first dereference: the level-reading branches below use
    // ph.Data_page_header.Num_values, so checking afterwards would surface a NullReferenceException
    // instead of the intended corruption error.
    if (ph.Data_page_header == null)
    {
        throw new ParquetException($"column '{_dataField.Path}' is missing data page header, file is corrupt");
    }

    using (BytesOwner bytes = ReadPageData(ph))
    {
        // todo: this is ugly, but will be removed once other parts are migrated to System.Memory
        using (var ms = bytes.ToStream())
        {
            ParquetEventSource.Current.OpenDataPage(_dataField.Path, _thriftColumnChunk.Meta_data.Codec.ToString(), ms.Length);

            using (var reader = new BinaryReader(ms))
            {
                if (_maxRepetitionLevel > 0)
                {
                    // todo: use rented buffers, but a rented array can be longer than requested, so any
                    // logic relying on the array length would have to be fixed as well.
                    if (cd.repetitions == null)
                    {
                        cd.repetitions = new int[cd.maxCount];
                    }

                    cd.repetitionsOffset += ReadLevels(reader, _maxRepetitionLevel, cd.repetitions, cd.repetitionsOffset, ph.Data_page_header.Num_values);
                }

                if (_maxDefinitionLevel > 0)
                {
                    if (cd.definitions == null)
                    {
                        cd.definitions = new int[cd.maxCount];
                    }

                    cd.definitionsOffset += ReadLevels(reader, _maxDefinitionLevel, cd.definitions, cd.definitionsOffset, ph.Data_page_header.Num_values);
                }

                // If statistics are defined, the null count gives the exact number of items to read.
                // It is not known whether every file with nulls has statistics; counting defined
                // values from the definition levels might be a better approach.
                int maxReadCount = ph.Data_page_header.Num_values - (int)(ph.Data_page_header.Statistics?.Null_count ?? 0);

                ReadColumn(reader, ph.Data_page_header.Encoding, maxValues, maxReadCount, cd);
            }
        }
    }
}
/// <summary>
/// Reads one data page from the input stream and, for non-repeated schemas, trims the level and
/// index lists to the page's declared number of values.
/// </summary>
/// <param name="ph">Header of the page to read.</param>
/// <param name="destination">List passed to <c>ReadColumnValues</c> to receive the values.</param>
/// <param name="maxValues">Not used by this method.</param>
/// <returns>Page data holding the definitions, repetitions and indexes that were read.</returns>
private PageData ReadDataPage(Thrift.PageHeader ph, IList destination, long maxValues)
{
    byte[] raw = ReadRawBytes(ph, _inputStream);
    int declaredCount = ph.Data_page_header.Num_values;

    using (var buffer = new MemoryStream(raw))
    using (var br = new BinaryReader(buffer))
    {
        List<int> repetitions = null;
        if (_schema.HasRepetitionLevelsPage)
        {
            repetitions = ReadRepetitionLevels(br);
        }

        List<int> definitions = null;
        if (_schema.HasDefinitionLevelsPage)
        {
            definitions = ReadDefinitionLevels(br);
        }

        // Indexes are pointers back into the values table (a lookup on values).
        List<int> indexes = ReadColumnValues(br, ph.Data_page_header.Encoding, destination, declaredCount);

        // Trim the output if it exceeds the declared number of values.
        if (!_schema.IsRepeated)
        {
            if (repetitions != null)
            {
                ValueMerger.TrimTail(repetitions, declaredCount);
            }

            if (definitions != null)
            {
                ValueMerger.TrimTail(definitions, declaredCount);
            }

            if (indexes != null)
            {
                ValueMerger.TrimTail(indexes, declaredCount);
            }
        }

        return new PageData
        {
            definitions = definitions,
            repetitions = repetitions,
            indexes = indexes
        };
    }
}
/// <summary>
/// Writes <paramref name="values"/> as a column chunk and fills in the chunk's size totals.
/// </summary>
/// <param name="offset">Not used by this method.</param>
/// <param name="count">Not used by this method.</param>
/// <param name="values">Values to write; when <c>null</c>, a list created by <c>TypeFactory</c> is used instead.</param>
/// <returns>The column chunk added to the file metadata.</returns>
public Thrift.ColumnChunk Write(int offset, int count, IList values)
{
    values = values ?? TypeFactory.Create(_schema.ElementType, _schema.IsNullable, _schema.IsRepeated);

    int valueCount = values.Count;
    Thrift.ColumnChunk columnChunk = _meta.AddColumnChunk(_compressionMethod, _output, _schema, valueCount);
    Thrift.PageHeader pageHeader = _meta.CreateDataPage(values.Count);

    List<PageTag> writtenPages = WriteValues(_schema, values, pageHeader, _compressionMethod);

    // Both totals must cover the page bodies and their headers.
    columnChunk.Meta_data.Total_compressed_size =
        writtenPages.Sum(tag => tag.HeaderMeta.Compressed_page_size + tag.HeaderSize);
    columnChunk.Meta_data.Total_uncompressed_size =
        writtenPages.Sum(tag => tag.HeaderMeta.Uncompressed_page_size + tag.HeaderSize);

    return columnChunk;
}
/// <summary>
/// Writes <paramref name="column"/> as a column chunk and fills in the chunk's value count and size totals.
/// </summary>
/// <param name="path">Column path passed to the footer when creating the chunk.</param>
/// <param name="column">Column data to write.</param>
/// <param name="dataTypeHandler">Handler that writes the column's values.</param>
/// <returns>The column chunk with its metadata populated.</returns>
public Thrift.ColumnChunk Write(List<string> path, DataColumn column, IDataTypeHandler dataTypeHandler)
{
    Thrift.ColumnChunk columnChunk = _footer.CreateColumnChunk(_compressionMethod, _stream, _schemaElement.Type, path, 0);
    Thrift.PageHeader pageHeader = _footer.CreateDataPage(column.TotalCount);

    _footer.GetLevels(columnChunk, out int maxRepetitionLevel, out int maxDefinitionLevel);

    List<PageTag> writtenPages = WriteColumn(column, _schemaElement, dataTypeHandler, maxRepetitionLevel, maxDefinitionLevel);

    Thrift.ColumnMetaData meta = columnChunk.Meta_data;

    // Must be the number of all values in the column, nulls included. For hierarchical/repeated
    // columns that is the count of the flattened list, nulls included.
    meta.Num_values = pageHeader.Data_page_header.Num_values;

    // Both totals must cover the page bodies and their headers.
    meta.Total_compressed_size = writtenPages.Sum(tag => tag.HeaderMeta.Compressed_page_size + tag.HeaderSize);
    meta.Total_uncompressed_size = writtenPages.Sum(tag => tag.HeaderMeta.Uncompressed_page_size + tag.HeaderSize);

    return columnChunk;
}
/// <summary>
/// Reads a dictionary page if <paramref name="ph"/> describes one.
/// </summary>
/// <param name="ph">Page header to inspect.</param>
/// <param name="dictionary">Receives the list read by the type handler, or <c>null</c> when the page is not a dictionary page.</param>
/// <returns><c>true</c> when a dictionary page was read; otherwise <c>false</c>.</returns>
private bool TryReadDictionaryPage(Thrift.PageHeader ph, out IList dictionary)
{
    bool isDictionaryPage = ph.Type == Thrift.PageType.DICTIONARY_PAGE;
    if (!isDictionaryPage)
    {
        dictionary = null;
        return false;
    }

    // Dictionary page format: the dictionary entries, in dictionary order, using plain encoding.
    using (Stream page = OpenDataPageStream(ph))
    using (var br = new BinaryReader(page))
    {
        dictionary = _dataTypeHandler.Read(_thriftSchemaElement, br, _parquetOptions);
    }

    return true;
}
/// <summary>
/// Reads one data page into the column accumulator <paramref name="cd"/>: levels are appended at the
/// accumulator's current offsets, then the values are read by <c>ReadColumn</c>.
/// </summary>
/// <param name="ph">Header of the page to read; must carry a data page header.</param>
/// <param name="cd">Accumulator; level arrays are created with <c>maxCount</c> elements on first use.</param>
/// <param name="maxValues">Value limit forwarded to <c>ReadColumn</c>.</param>
/// <exception cref="ParquetException">The page header has no data page header.</exception>
private void ReadDataPage(Thrift.PageHeader ph, ColumnRawData cd, long maxValues)
{
    // Validate before the first dereference: the level-reading branches below use
    // ph.Data_page_header.Num_values, so checking afterwards would surface a NullReferenceException
    // instead of the intended corruption error.
    if (ph.Data_page_header == null)
    {
        throw new ParquetException($"column '{_dataField.Path}' is missing data page header, file is corrupt");
    }

    using (BytesOwner bytes = ReadPageData(ph))
    {
        // todo: this is ugly, but will be removed once other parts are migrated to System.Memory
        using (var ms = bytes.ToStream())
        {
            ParquetEventSource.Current.OpenDataPage(_dataField.Path, _thriftColumnChunk.Meta_data.Codec.ToString(), ms.Length);

            using (var reader = new BinaryReader(ms))
            {
                if (_maxRepetitionLevel > 0)
                {
                    // todo: use rented buffers, but a rented array can be longer than requested, so any
                    // logic relying on the array length would have to be fixed as well.
                    if (cd.repetitions == null)
                    {
                        cd.repetitions = new int[cd.maxCount];
                    }

                    cd.repetitionsOffset += ReadLevels(reader, _maxRepetitionLevel, cd.repetitions, cd.repetitionsOffset, ph.Data_page_header.Num_values);
                }

                if (_maxDefinitionLevel > 0)
                {
                    if (cd.definitions == null)
                    {
                        cd.definitions = new int[cd.maxCount];
                    }

                    cd.definitionsOffset += ReadLevels(reader, _maxDefinitionLevel, cd.definitions, cd.definitionsOffset, ph.Data_page_header.Num_values);
                }

                ReadColumn(reader, ph.Data_page_header.Encoding, maxValues, ph.Data_page_header.Num_values, cd);
            }
        }
    }
}
/// <summary>
/// Reads a dictionary page from raw page bytes if <paramref name="ph"/> describes one.
/// </summary>
/// <param name="ph">Page header to inspect.</param>
/// <param name="dictionary">Receives the list read by the type handler, or <c>null</c> when the page is not a dictionary page.</param>
/// <returns><c>true</c> when a dictionary page was read; otherwise <c>false</c>.</returns>
bool TryReadDictionaryPage(Thrift.PageHeader ph, out IList dictionary)
{
    bool isDictionaryPage = ph.Type == Thrift.PageType.DICTIONARY_PAGE;
    if (!isDictionaryPage)
    {
        dictionary = null;
        return false;
    }

    // Dictionary page format: the dictionary entries, in dictionary order, using plain encoding.
    byte[] raw = ReadRawBytes(ph, _inputStream);

    using (var buffer = new MemoryStream(raw))
    using (var br = new BinaryReader(buffer))
    {
        dictionary = _dataTypeHandler.Read(_thriftSchemaElement, br, _parquetOptions);
    }

    return true;
}
/// <summary>
/// Creates a writer for one column: stores its collaborators, resolves the data type handler for the
/// schema element, creates the column chunk and an initial data page header, and caches the maximum
/// repetition and definition levels reported by the footer.
/// </summary>
/// <param name="output">Stream the column is written to.</param>
/// <param name="thriftStream">Thrift stream stored for later use.</param>
/// <param name="footer">Footer used to create the chunk and page header and to look up levels.</param>
/// <param name="tse">Schema element of the column.</param>
/// <param name="path">Column path passed to the footer when creating the chunk.</param>
/// <param name="compressionMethod">Compression method for the column chunk.</param>
/// <param name="formatOptions">Options used to match the data type handler.</param>
/// <param name="writerOptions">Writer options stored for later use.</param>
public ColumnarWriter(Stream output, ThriftStream thriftStream, ThriftFooter footer, Thrift.SchemaElement tse, List<string> path, CompressionMethod compressionMethod, ParquetOptions formatOptions, WriterOptions writerOptions)
{
    // Collaborators and settings.
    _output = output;
    _thriftStream = thriftStream;
    _footer = footer;
    _tse = tse;
    _compressionMethod = compressionMethod;
    _formatOptions = formatOptions;
    _writerOptions = writerOptions;

    // Derived state.
    _dataTypeHandler = DataTypeFactory.Match(tse, _formatOptions);
    _chunk = _footer.CreateColumnChunk(_compressionMethod, _output, _tse.Type, path, 0);
    _ph = _footer.CreateDataPage(0);

    _footer.GetLevels(_chunk, out int repetitionLevel, out int definitionLevel);
    _maxRepetitionLevel = repetitionLevel;
    _maxDefinitionLevel = definitionLevel;
}
/// <summary>
/// Reads a dictionary page if <paramref name="ph"/> describes one. The dictionary array is created
/// with as many elements as the column chunk declares values.
/// </summary>
/// <param name="ph">Page header to inspect.</param>
/// <param name="dictionary">Receives the dictionary array, or <c>null</c> when the page is not a dictionary page.</param>
/// <param name="dictionaryOffset">Receives the value returned by the type handler's read, or 0 when not a dictionary page.</param>
/// <returns><c>true</c> when a dictionary page was read; otherwise <c>false</c>.</returns>
private bool TryReadDictionaryPage(Thrift.PageHeader ph, out Array dictionary, out int dictionaryOffset)
{
    bool isDictionaryPage = ph.Type == Thrift.PageType.DICTIONARY_PAGE;
    if (!isDictionaryPage)
    {
        dictionary = null;
        dictionaryOffset = 0;
        return false;
    }

    // Dictionary page format: the dictionary entries, in dictionary order, using plain encoding.
    using (Stream page = OpenDataPageStream(ph))
    using (var br = new BinaryReader(page))
    {
        int capacity = (int)_thriftColumnChunk.Meta_data.Num_values;

        dictionary = _dataTypeHandler.GetArray(capacity, false, false);
        dictionaryOffset = _dataTypeHandler.Read(br, _thriftSchemaElement, dictionary, 0, _parquetOptions);
    }

    return true;
}