/// <summary>
/// Reads one data page from the column chunk: optional repetition levels,
/// optional definition levels, then the encoded values themselves.
/// Level arrays are lazily allocated to the column's full value count.
/// </summary>
/// <param name="ph">Thrift page header for the page to read.</param>
/// <param name="cd">Accumulator for raw column data; offsets are advanced in place.</param>
/// <param name="maxValues">Upper bound on values still expected for this column chunk.</param>
private void ReadDataPage(Thrift.PageHeader ph, ColumnRawData cd, long maxValues)
{
   using (Stream pageStream = OpenDataPageStream(ph))
   {
      ParquetEventSource.Current.OpenDataPage(_dataField.Path, _thriftColumnChunk.Meta_data.Codec.ToString(), pageStream.Length);

      using (var reader = new BinaryReader(pageStream))
      {
         // repetition levels exist only when the field is nested inside a repeated structure
         if (_maxRepetitionLevel > 0)
         {
            //todo: use rented buffers, but be aware that rented length can be more than requested so underlying logic relying on array length must be fixed too.
            cd.repetitions = cd.repetitions ?? new int[cd.maxCount];

            cd.repetitionsOffset += ReadLevels(reader, _maxRepetitionLevel, cd.repetitions, cd.repetitionsOffset);
         }

         // definition levels exist only when the field (or an ancestor) is optional
         if (_maxDefinitionLevel > 0)
         {
            cd.definitions = cd.definitions ?? new int[cd.maxCount];

            cd.definitionsOffset += ReadLevels(reader, _maxDefinitionLevel, cd.definitions, cd.definitionsOffset);
         }

         // remaining bytes in the page are the encoded values
         ReadColumn(reader, ph.Data_page_header.Encoding, maxValues, ref cd.values, ref cd.valuesOffset, ref cd.indexes, ref cd.indexesOffset);
      }
   }
}
/// <summary>
/// Reads the whole column chunk (optional dictionary page followed by one or
/// more data pages) and materialises it as a <see cref="DataColumn"/>.
/// </summary>
/// <returns>Fully populated column with values, levels and optional dictionary data.</returns>
public DataColumn Read()
{
   long fileOffset = GetFileOffset();
   long maxValues = _thriftColumnChunk.Meta_data.Num_values;

   _inputStream.Seek(fileOffset, SeekOrigin.Begin);

   ParquetEventSource.Current.SeekColumn(_dataField.Path, fileOffset);

   var colData = new ColumnRawData();
   colData.maxCount = (int)_thriftColumnChunk.Meta_data.Num_values;

   //there can be only one dictionary page in column
   Thrift.PageHeader ph = _thriftStream.Read<Thrift.PageHeader>();
   if (TryReadDictionaryPage(ph, out colData.dictionary, out colData.dictionaryOffset))
   {
      ph = _thriftStream.Read<Thrift.PageHeader>();
   }

   while (true)
   {
      // fix: progress must be measured by the running offsets (values actually read),
      // not by array Length — arrays are pre-allocated to the full column size, so
      // using Length made the loop terminate after the first page and silently drop
      // the remaining pages of multi-page columns.
      int valuesSoFar = Math.Max(
         colData.indexes == null ? 0 : colData.indexesOffset,
         colData.values == null ? 0 : colData.valuesOffset);

      ReadDataPage(ph, colData, maxValues - valuesSoFar);

      int totalCount = Math.Max(
         (colData.values == null ? 0 : colData.valuesOffset) +
         (colData.indexes == null ? 0 : colData.indexesOffset),
         colData.definitions == null ? 0 : colData.definitionsOffset);
      if (totalCount >= maxValues)
      {
         break;   //limit reached
      }

      ph = _thriftStream.Read<Thrift.PageHeader>();
      if (ph.Type != Thrift.PageType.DATA_PAGE)
      {
         break;
      }
   }

   // all the data is available here!
   return new DataColumn(
      _dataField, colData.values,
      colData.definitions, _maxDefinitionLevel,
      colData.repetitions, _maxRepetitionLevel,
      colData.dictionary,
      colData.indexes);
}
/// <summary>
/// Reads one data page: optional repetition levels, optional definition
/// levels, then the encoded values. Uses page statistics (when present)
/// to bound how many non-null values to decode.
/// </summary>
/// <param name="ph">Thrift page header for the page being read.</param>
/// <param name="cd">Accumulator for raw column data; offsets advance in place.</param>
/// <param name="maxValues">Upper bound on values still expected for this column chunk.</param>
/// <exception cref="ParquetException">The page has no data page header (corrupt file).</exception>
private void ReadDataPage(Thrift.PageHeader ph, ColumnRawData cd, long maxValues)
{
   // fix: validate the header BEFORE any dereference — the original performed this
   // check only after ph.Data_page_header.Num_values had already been read, so a
   // corrupt file produced a NullReferenceException instead of this diagnostic.
   if (ph.Data_page_header == null)
   {
      throw new ParquetException($"column '{_dataField.Path}' is missing data page header, file is corrupt");
   }

   using (BytesOwner bytes = ReadPageData(ph))
   {
      //todo: this is ugly, but will be removed once other parts are migrated to System.Memory
      using (var ms = bytes.ToStream())
      {
         ParquetEventSource.Current.OpenDataPage(_dataField.Path, _thriftColumnChunk.Meta_data.Codec.ToString(), ms.Length);

         using (var reader = new BinaryReader(ms))
         {
            // repetition levels exist only for fields nested in repeated structures
            if (_maxRepetitionLevel > 0)
            {
               //todo: use rented buffers, but be aware that rented length can be more than requested so underlying logic relying on array length must be fixed too.
               if (cd.repetitions == null)
                  cd.repetitions = new int[cd.maxCount];

               cd.repetitionsOffset += ReadLevels(reader, _maxRepetitionLevel, cd.repetitions, cd.repetitionsOffset, ph.Data_page_header.Num_values);
            }

            // definition levels exist only when the field (or an ancestor) is optional
            if (_maxDefinitionLevel > 0)
            {
               if (cd.definitions == null)
                  cd.definitions = new int[cd.maxCount];

               cd.definitionsOffset += ReadLevels(reader, _maxDefinitionLevel, cd.definitions, cd.definitionsOffset, ph.Data_page_header.Num_values);
            }

            // if statistics are defined, use null count to determine the exact number of items we should read
            // however, I don't know if all parquet files with null values have stats defined. Maybe a better solution would
            // be using a count of defined values (from reading definitions?)
            int maxReadCount = ph.Data_page_header.Num_values - (int)(ph.Data_page_header.Statistics?.Null_count ?? 0);
            ReadColumn(reader, ph.Data_page_header.Encoding, maxValues, maxReadCount, cd);
         }
      }
   }
}
/// <summary>
/// Reads one data page: optional repetition levels, optional definition
/// levels, then the encoded values (up to the page's declared value count).
/// </summary>
/// <param name="ph">Thrift page header for the page being read.</param>
/// <param name="cd">Accumulator for raw column data; offsets advance in place.</param>
/// <param name="maxValues">Upper bound on values still expected for this column chunk.</param>
/// <exception cref="ParquetException">The page has no data page header (corrupt file).</exception>
private void ReadDataPage(Thrift.PageHeader ph, ColumnRawData cd, long maxValues)
{
   // fix: validate the header BEFORE any dereference — the original performed this
   // check only after ph.Data_page_header.Num_values had already been read, so a
   // corrupt file produced a NullReferenceException instead of this diagnostic.
   if (ph.Data_page_header == null)
   {
      throw new ParquetException($"column '{_dataField.Path}' is missing data page header, file is corrupt");
   }

   using (BytesOwner bytes = ReadPageData(ph))
   {
      //todo: this is ugly, but will be removed once other parts are migrated to System.Memory
      using (var ms = bytes.ToStream())
      {
         ParquetEventSource.Current.OpenDataPage(_dataField.Path, _thriftColumnChunk.Meta_data.Codec.ToString(), ms.Length);

         using (var reader = new BinaryReader(ms))
         {
            // repetition levels exist only for fields nested in repeated structures
            if (_maxRepetitionLevel > 0)
            {
               //todo: use rented buffers, but be aware that rented length can be more than requested so underlying logic relying on array length must be fixed too.
               if (cd.repetitions == null)
                  cd.repetitions = new int[cd.maxCount];

               cd.repetitionsOffset += ReadLevels(reader, _maxRepetitionLevel, cd.repetitions, cd.repetitionsOffset, ph.Data_page_header.Num_values);
            }

            // definition levels exist only when the field (or an ancestor) is optional
            if (_maxDefinitionLevel > 0)
            {
               if (cd.definitions == null)
                  cd.definitions = new int[cd.maxCount];

               cd.definitionsOffset += ReadLevels(reader, _maxDefinitionLevel, cd.definitions, cd.definitionsOffset, ph.Data_page_header.Num_values);
            }

            ReadColumn(reader, ph.Data_page_header.Encoding, maxValues, ph.Data_page_header.Num_values, cd);
         }
      }
   }
}
/// <summary>
/// Reads the whole column chunk (optional dictionary page followed by one or
/// more data pages) and materialises it as a <see cref="DataColumn"/>,
/// attaching chunk-level statistics when the file provides them.
/// </summary>
/// <returns>Fully populated column with values, levels, optional dictionary data and statistics.</returns>
public DataColumn Read()
{
   long fileOffset = GetFileOffset();
   long maxValues = _thriftColumnChunk.Meta_data.Num_values;

   _inputStream.Seek(fileOffset, SeekOrigin.Begin);

   ParquetEventSource.Current.SeekColumn(_dataField.Path, fileOffset);

   var colData = new ColumnRawData();
   colData.maxCount = (int)_thriftColumnChunk.Meta_data.Num_values;

   //there can be only one dictionary page in column
   Thrift.PageHeader ph = _thriftStream.Read<Thrift.PageHeader>();
   if (TryReadDictionaryPage(ph, out colData.dictionary, out colData.dictionaryOffset))
   {
      ph = _thriftStream.Read<Thrift.PageHeader>();
   }

   // removed: unused pagesRead counter and a block of commented-out exhaustion logic
   while (true)
   {
      ReadDataPage(ph, colData, maxValues);

      // the chunk is exhausted once either the value stream or the definition
      // level stream has produced the declared number of values
      int totalCount = Math.Max(
         colData.values == null ? 0 : colData.valuesOffset,
         colData.definitions == null ? 0 : colData.definitionsOffset);
      if (totalCount >= maxValues)
      {
         break;   //limit reached
      }

      ph = _thriftStream.Read<Thrift.PageHeader>();
      if (ph.Type != Thrift.PageType.DATA_PAGE)
      {
         break;
      }
   }

   // all the data is available here!
   var finalColumn = new DataColumn(
      _dataField, colData.values,
      colData.definitions, _maxDefinitionLevel,
      colData.repetitions, _maxRepetitionLevel,
      colData.dictionary,
      colData.indexes);

   if (_thriftColumnChunk.Meta_data.Statistics != null)
   {
      finalColumn.Statistics = new DataColumnStatistics(
         _thriftColumnChunk.Meta_data.Statistics.Null_count,
         _thriftColumnChunk.Meta_data.Statistics.Distinct_count,
         _dataTypeHandler.PlainDecode(_thriftSchemaElement, _thriftColumnChunk.Meta_data.Statistics.Min_value),
         _dataTypeHandler.PlainDecode(_thriftSchemaElement, _thriftColumnChunk.Meta_data.Statistics.Max_value));
   }

   return finalColumn;
}
/// <summary>
/// Decodes the value section of a data page according to its encoding.
/// PLAIN values are read straight into <c>cd.values</c>; RLE and
/// PLAIN_DICTIONARY produce dictionary indexes that are immediately merged
/// into <c>cd.values</c> via the data type handler.
/// </summary>
/// <param name="reader">Positioned at the start of the encoded value bytes.</param>
/// <param name="encoding">Value encoding declared by the page header.</param>
/// <param name="totalValues">Total values in the column; sizes lazy allocations.</param>
/// <param name="maxReadCount">Maximum number of non-null values to decode from this page.</param>
/// <param name="cd">Accumulator for raw column data; offsets advance in place.</param>
/// <exception cref="ParquetException">The encoding is not supported.</exception>
private void ReadColumn(BinaryReader reader, Thrift.Encoding encoding, long totalValues, int maxReadCount, ColumnRawData cd)
{
   //dictionary encoding uses RLE to encode data
   cd.values = cd.values ?? _dataTypeHandler.GetArray((int)totalValues, false, false);

   switch (encoding)
   {
      case Thrift.Encoding.PLAIN:
      {
         cd.valuesOffset += _dataTypeHandler.Read(reader, _thriftSchemaElement, cd.values, cd.valuesOffset);
         break;
      }

      case Thrift.Encoding.RLE:
      {
         cd.indexes = cd.indexes ?? new int[(int)totalValues];
         int decoded = RunLengthBitPackingHybridValuesReader.Read(reader, _thriftSchemaElement.Type_length, cd.indexes, 0, maxReadCount);
         // resolve indexes against the dictionary straight away so cd.values stays authoritative
         _dataTypeHandler.MergeDictionary(cd.dictionary, cd.indexes, cd.values, cd.valuesOffset, decoded);
         cd.valuesOffset += decoded;
         break;
      }

      case Thrift.Encoding.PLAIN_DICTIONARY:
      {
         cd.indexes = cd.indexes ?? new int[(int)totalValues];
         int decoded = ReadPlainDictionary(reader, maxReadCount, cd.indexes, 0);
         _dataTypeHandler.MergeDictionary(cd.dictionary, cd.indexes, cd.values, cd.valuesOffset, decoded);
         cd.valuesOffset += decoded;
         break;
      }

      default:
         throw new ParquetException($"encoding {encoding} is not supported.");
   }
}