/// <summary>
/// Reads a single data page into <paramref name="cd"/>: repetition levels (only when
/// <c>_maxRepetitionLevel &gt; 0</c>), definition levels (only when
/// <c>_maxDefinitionLevel &gt; 0</c>), and finally the column values via <c>ReadColumn</c>.
/// </summary>
/// <param name="ph">Page header. Its <c>Data_page_header</c> must be present.</param>
/// <param name="cd">
/// Destination for the page contents. Level arrays are allocated on first use with
/// <c>cd.maxCount</c> elements, and the matching offsets are advanced by whatever
/// <c>ReadLevels</c> returns.
/// </param>
/// <param name="maxValues">Forwarded unchanged to <c>ReadColumn</c>.</param>
/// <exception cref="ParquetException">
/// Thrown when <paramref name="ph"/> carries no data page header.
/// </exception>
private void ReadDataPage(Thrift.PageHeader ph, ColumnRawData cd, long maxValues)
{
    using (BytesOwner bytes = ReadPageData(ph))
    {
        //todo: this is ugly, but will be removed once other parts are migrated to System.Memory
        using (var ms = bytes.ToStream())
        {
            ParquetEventSource.Current.OpenDataPage(_dataField.Path, _thriftColumnChunk.Meta_data.Codec.ToString(), ms.Length);

            using (var reader = new BinaryReader(ms))
            {
                // Validate the header BEFORE its first use. Previously this check sat after the
                // level-reading branches, which already dereference the header, so a corrupt file
                // surfaced as a NullReferenceException instead of a ParquetException.
                var header = ph.Data_page_header;
                if (header == null)
                {
                    throw new ParquetException($"column '{_dataField.Path}' is missing data page header, file is corrupt");
                }

                if (_maxRepetitionLevel > 0)
                {
                    //todo: use rented buffers, but be aware that rented length can be more than requested so underlying logic relying on array length must be fixed too.
                    if (cd.repetitions == null)
                    {
                        cd.repetitions = new int[cd.maxCount];
                    }

                    cd.repetitionsOffset += ReadLevels(reader, _maxRepetitionLevel, cd.repetitions, cd.repetitionsOffset, header.Num_values);
                }

                if (_maxDefinitionLevel > 0)
                {
                    if (cd.definitions == null)
                    {
                        cd.definitions = new int[cd.maxCount];
                    }

                    cd.definitionsOffset += ReadLevels(reader, _maxDefinitionLevel, cd.definitions, cd.definitionsOffset, header.Num_values);
                }

                // If statistics are defined, use the null count to determine the exact number of
                // items to read. It is not certain that every parquet file containing nulls has
                // statistics defined; counting defined values while reading the definition levels
                // may be a better solution.
                int maxReadCount = header.Num_values - (int)(header.Statistics?.Null_count ?? 0);

                ReadColumn(reader, header.Encoding, maxValues, maxReadCount, cd);
            }
        }
    }
}
/// <summary>
/// Reads a single data page into <paramref name="cd"/>: repetition levels (only when
/// <c>_maxRepetitionLevel &gt; 0</c>), definition levels (only when
/// <c>_maxDefinitionLevel &gt; 0</c>), and finally the column values via <c>ReadColumn</c>,
/// which is given the page's <c>Num_values</c> as its read count.
/// </summary>
/// <param name="ph">Page header. Its <c>Data_page_header</c> must be present.</param>
/// <param name="cd">
/// Destination for the page contents. Level arrays are allocated on first use with
/// <c>cd.maxCount</c> elements, and the matching offsets are advanced by whatever
/// <c>ReadLevels</c> returns.
/// </param>
/// <param name="maxValues">Forwarded unchanged to <c>ReadColumn</c>.</param>
/// <exception cref="ParquetException">
/// Thrown when <paramref name="ph"/> carries no data page header.
/// </exception>
private void ReadDataPage(Thrift.PageHeader ph, ColumnRawData cd, long maxValues)
{
    using (BytesOwner bytes = ReadPageData(ph))
    {
        //todo: this is ugly, but will be removed once other parts are migrated to System.Memory
        using (var ms = bytes.ToStream())
        {
            ParquetEventSource.Current.OpenDataPage(_dataField.Path, _thriftColumnChunk.Meta_data.Codec.ToString(), ms.Length);

            using (var reader = new BinaryReader(ms))
            {
                // The header must be validated before the level-reading branches below, because
                // they read Num_values from it. Checking afterwards (as before) let a corrupt
                // file fail with a NullReferenceException instead of a ParquetException.
                if (ph.Data_page_header == null)
                {
                    throw new ParquetException($"column '{_dataField.Path}' is missing data page header, file is corrupt");
                }

                int valueCount = ph.Data_page_header.Num_values;

                if (_maxRepetitionLevel > 0)
                {
                    //todo: use rented buffers, but be aware that rented length can be more than requested so underlying logic relying on array length must be fixed too.
                    if (cd.repetitions == null)
                    {
                        cd.repetitions = new int[cd.maxCount];
                    }

                    cd.repetitionsOffset += ReadLevels(reader, _maxRepetitionLevel, cd.repetitions, cd.repetitionsOffset, valueCount);
                }

                if (_maxDefinitionLevel > 0)
                {
                    if (cd.definitions == null)
                    {
                        cd.definitions = new int[cd.maxCount];
                    }

                    cd.definitionsOffset += ReadLevels(reader, _maxDefinitionLevel, cd.definitions, cd.definitionsOffset, valueCount);
                }

                ReadColumn(reader, ph.Data_page_header.Encoding, maxValues, valueCount, cd);
            }
        }
    }
}