/// <summary>
/// Attempts to decode a dictionary page into a plain-encoded value array.
/// </summary>
/// <param name="ph">Thrift page header describing the page to read.</param>
/// <param name="dictionary">Receives the decoded dictionary values, or null when the page is not a dictionary page.</param>
/// <param name="dictionaryOffset">Receives the number of values actually read into <paramref name="dictionary"/>.</param>
/// <returns>True when the page was a dictionary page and was decoded; false otherwise.</returns>
private bool TryReadDictionaryPage(Thrift.PageHeader ph, out Array dictionary, out int dictionaryOffset)
{
   // only dictionary pages can be decoded here; anything else is reported back as a miss
   if (ph.Type != Thrift.PageType.DICTIONARY_PAGE)
   {
      dictionary = null;
      dictionaryOffset = 0;
      return false;
   }

   //Dictionary page format: the entries in the dictionary - in dictionary order - using the plain encoding.
   using (BytesOwner pageBytes = ReadPageData(ph))
   {
      //todo: this is ugly, but will be removed once other parts are migrated to System.Memory
      using (var pageStream = new MemoryStream(pageBytes.Memory.ToArray()))
      using (var pageReader = new BinaryReader(pageStream))
      {
         dictionary = _dataTypeHandler.GetArray(ph.Dictionary_page_header.Num_values, false, false);
         dictionaryOffset = _dataTypeHandler.Read(pageReader, _thriftSchemaElement, dictionary, 0);
         return true;
      }
   }
}
/// <summary>
/// Reads one data page: decodes repetition levels, definition levels and column values
/// into <paramref name="cd"/>, advancing its offsets.
/// </summary>
/// <param name="ph">Thrift page header for the data page.</param>
/// <param name="cd">Accumulator for decoded levels and values; level buffers are lazily allocated.</param>
/// <param name="maxValues">Upper bound on the number of values to read into the column.</param>
/// <exception cref="ParquetException">Thrown when the page header is missing, indicating a corrupt file.</exception>
private void ReadDataPage(Thrift.PageHeader ph, ColumnRawData cd, long maxValues)
{
   using (BytesOwner bytes = ReadPageData(ph))
   {
      //todo: this is ugly, but will be removed once other parts are migrated to System.Memory
      using (var ms = bytes.ToStream())
      {
         ParquetEventSource.Current.OpenDataPage(_dataField.Path, _thriftColumnChunk.Meta_data.Codec.ToString(), ms.Length);

         using (var reader = new BinaryReader(ms))
         {
            // Validate the header BEFORE any dereference. Previously this check ran after
            // Num_values had already been read for the level decoding, so a corrupt file
            // produced a NullReferenceException instead of the intended ParquetException.
            if (ph.Data_page_header == null)
            {
               throw new ParquetException($"column '{_dataField.Path}' is missing data page header, file is corrupt");
            }

            if (_maxRepetitionLevel > 0)
            {
               //todo: use rented buffers, but be aware that rented length can be more than requested so underlying logic relying on array length must be fixed too.
               if (cd.repetitions == null)
                  cd.repetitions = new int[cd.maxCount];

               cd.repetitionsOffset += ReadLevels(reader, _maxRepetitionLevel, cd.repetitions, cd.repetitionsOffset, ph.Data_page_header.Num_values);
            }

            if (_maxDefinitionLevel > 0)
            {
               if (cd.definitions == null)
                  cd.definitions = new int[cd.maxCount];

               cd.definitionsOffset += ReadLevels(reader, _maxDefinitionLevel, cd.definitions, cd.definitionsOffset, ph.Data_page_header.Num_values);
            }

            // if statistics are defined, use null count to determine the exact number of items we should read
            // however, I don't know if all parquet files with null values have stats defined. Maybe a better solution would
            // be using a count of defined values (from reading definitions?)
            int maxReadCount = ph.Data_page_header.Num_values - (int)(ph.Data_page_header.Statistics?.Null_count ?? 0);
            ReadColumn(reader, ph.Data_page_header.Encoding, maxValues, maxReadCount, cd);
         }
      }
   }
}
/// <summary>
/// Reads one data page: decodes repetition levels, definition levels and column values
/// into <paramref name="cd"/>, advancing its offsets.
/// </summary>
/// <param name="ph">Thrift page header for the data page.</param>
/// <param name="cd">Accumulator for decoded levels and values; level buffers are lazily allocated.</param>
/// <param name="maxValues">Upper bound on the number of values to read into the column.</param>
/// <exception cref="ParquetException">Thrown when the page header is missing, indicating a corrupt file.</exception>
private void ReadDataPage(Thrift.PageHeader ph, ColumnRawData cd, long maxValues)
{
   using (BytesOwner bytes = ReadPageData(ph))
   {
      //todo: this is ugly, but will be removed once other parts are migrated to System.Memory
      using (var ms = bytes.ToStream())
      {
         ParquetEventSource.Current.OpenDataPage(_dataField.Path, _thriftColumnChunk.Meta_data.Codec.ToString(), ms.Length);

         using (var reader = new BinaryReader(ms))
         {
            // Validate the header BEFORE any dereference. Previously this check ran after
            // Num_values had already been read for the level decoding, so a corrupt file
            // produced a NullReferenceException instead of the intended ParquetException.
            if (ph.Data_page_header == null)
            {
               throw new ParquetException($"column '{_dataField.Path}' is missing data page header, file is corrupt");
            }

            if (_maxRepetitionLevel > 0)
            {
               //todo: use rented buffers, but be aware that rented length can be more than requested so underlying logic relying on array length must be fixed too.
               if (cd.repetitions == null)
                  cd.repetitions = new int[cd.maxCount];

               cd.repetitionsOffset += ReadLevels(reader, _maxRepetitionLevel, cd.repetitions, cd.repetitionsOffset, ph.Data_page_header.Num_values);
            }

            if (_maxDefinitionLevel > 0)
            {
               if (cd.definitions == null)
                  cd.definitions = new int[cd.maxCount];

               cd.definitionsOffset += ReadLevels(reader, _maxDefinitionLevel, cd.definitions, cd.definitionsOffset, ph.Data_page_header.Num_values);
            }

            ReadColumn(reader, ph.Data_page_header.Encoding, maxValues, ph.Data_page_header.Num_values, cd);
         }
      }
   }
}