Ejemplo n.º 1
0
        /// <summary>
        /// Attempts to read the supplied page as a dictionary page.
        /// </summary>
        /// <param name="ph">Header of the page to inspect.</param>
        /// <param name="dictionary">
        /// On success, the array obtained from the data type handler and handed to its reader;
        /// <c>null</c> when the page is not a dictionary page.
        /// </param>
        /// <param name="dictionaryOffset">
        /// On success, the value returned by the data type handler's read call;
        /// 0 when the page is not a dictionary page.
        /// </param>
        /// <returns><c>true</c> when the page type is DICTIONARY_PAGE, otherwise <c>false</c>.</returns>
        private bool TryReadDictionaryPage(Thrift.PageHeader ph, out Array dictionary, out int dictionaryOffset)
        {
            bool isDictionaryPage = ph.Type == Thrift.PageType.DICTIONARY_PAGE;
            if (!isDictionaryPage)
            {
                dictionaryOffset = 0;
                dictionary = null;
                return false;
            }

            // Dictionary page format: the entries in the dictionary - in dictionary order - using the plain encoding.
            // todo: copying the page into a MemoryStream is ugly, but will be removed once other parts are migrated to System.Memory
            using (BytesOwner pageData = ReadPageData(ph))
            using (var pageStream = new MemoryStream(pageData.Memory.ToArray()))
            using (var pageReader = new BinaryReader(pageStream))
            {
                // The array is sized from the value count declared in the dictionary page header.
                var entryCount = ph.Dictionary_page_header.Num_values;
                dictionary = _dataTypeHandler.GetArray(entryCount, false, false);

                // Fill the array starting at index 0; the handler's return value becomes the offset.
                dictionaryOffset = _dataTypeHandler.Read(pageReader, _thriftSchemaElement, dictionary, 0);
                return true;
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Reads a single data page into <paramref name="cd"/>: repetition levels (when the column's
        /// max repetition level is above zero), definition levels (when the max definition level is
        /// above zero), and then the column values.
        /// </summary>
        /// <param name="ph">Header of the page to read; its data page header must be present.</param>
        /// <param name="cd">
        /// Accumulator for the column chunk. The repetition and definition arrays are allocated with
        /// <c>cd.maxCount</c> elements on first use and their offsets are advanced by the number
        /// returned from each level read.
        /// </param>
        /// <param name="maxValues">Passed through unchanged to the column value reader.</param>
        /// <exception cref="ParquetException">The page has no data page header.</exception>
        private void ReadDataPage(Thrift.PageHeader ph, ColumnRawData cd, long maxValues)
        {
            // Validate the header before anything dereferences it. Previously this check sat after the
            // level reads, which already use Data_page_header.Num_values, so a missing header surfaced
            // as a NullReferenceException instead of the intended ParquetException.
            if (ph.Data_page_header == null)
            {
                throw new ParquetException($"column '{_dataField.Path}' is missing data page header, file is corrupt");
            }

            var dataHeader = ph.Data_page_header;

            using (BytesOwner bytes = ReadPageData(ph))
            {
                //todo: this is ugly, but will be removed once other parts are migrated to System.Memory
                using (var ms = bytes.ToStream())
                {
                    ParquetEventSource.Current.OpenDataPage(_dataField.Path, _thriftColumnChunk.Meta_data.Codec.ToString(), ms.Length);

                    using (var reader = new BinaryReader(ms))
                    {
                        if (_maxRepetitionLevel > 0)
                        {
                            //todo: use rented buffers, but be aware that rented length can be more than requested so underlying logic relying on array length must be fixed too.
                            if (cd.repetitions == null)
                            {
                                cd.repetitions = new int[cd.maxCount];
                            }

                            cd.repetitionsOffset += ReadLevels(reader, _maxRepetitionLevel, cd.repetitions, cd.repetitionsOffset, dataHeader.Num_values);
                        }

                        if (_maxDefinitionLevel > 0)
                        {
                            if (cd.definitions == null)
                            {
                                cd.definitions = new int[cd.maxCount];
                            }

                            cd.definitionsOffset += ReadLevels(reader, _maxDefinitionLevel, cd.definitions, cd.definitionsOffset, dataHeader.Num_values);
                        }

                        // if statistics are defined, use null count to determine the exact number of items we should read
                        // however, I don't know if all parquet files with null values have stats defined. Maybe a better solution would
                        // be using a count of defined values (from reading definitions?)
                        int maxReadCount = dataHeader.Num_values - (int)(dataHeader.Statistics?.Null_count ?? 0);
                        ReadColumn(reader, dataHeader.Encoding, maxValues, maxReadCount, cd);
                    }
                }
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Reads a single data page into <paramref name="cd"/>: repetition levels (when the column's
        /// max repetition level is above zero), definition levels (when the max definition level is
        /// above zero), and then the column values.
        /// </summary>
        /// <param name="ph">Header of the page to read; its data page header must be present.</param>
        /// <param name="cd">
        /// Accumulator for the column chunk. The repetition and definition arrays are allocated with
        /// <c>cd.maxCount</c> elements on first use and their offsets are advanced by the number
        /// returned from each level read.
        /// </param>
        /// <param name="maxValues">Passed through unchanged to the column value reader.</param>
        /// <exception cref="ParquetException">The page has no data page header.</exception>
        private void ReadDataPage(Thrift.PageHeader ph, ColumnRawData cd, long maxValues)
        {
            // Validate the header before anything dereferences it. Previously this check sat after the
            // level reads, which already use Data_page_header.Num_values, so a missing header surfaced
            // as a NullReferenceException instead of the intended ParquetException.
            if (ph.Data_page_header == null)
            {
                throw new ParquetException($"column '{_dataField.Path}' is missing data page header, file is corrupt");
            }

            var dataHeader = ph.Data_page_header;

            using (BytesOwner bytes = ReadPageData(ph))
            {
                //todo: this is ugly, but will be removed once other parts are migrated to System.Memory
                using (var ms = bytes.ToStream())
                {
                    ParquetEventSource.Current.OpenDataPage(_dataField.Path, _thriftColumnChunk.Meta_data.Codec.ToString(), ms.Length);

                    using (var reader = new BinaryReader(ms))
                    {
                        if (_maxRepetitionLevel > 0)
                        {
                            //todo: use rented buffers, but be aware that rented length can be more than requested so underlying logic relying on array length must be fixed too.
                            if (cd.repetitions == null)
                            {
                                cd.repetitions = new int[cd.maxCount];
                            }

                            cd.repetitionsOffset += ReadLevels(reader, _maxRepetitionLevel, cd.repetitions, cd.repetitionsOffset, dataHeader.Num_values);
                        }

                        if (_maxDefinitionLevel > 0)
                        {
                            if (cd.definitions == null)
                            {
                                cd.definitions = new int[cd.maxCount];
                            }

                            cd.definitionsOffset += ReadLevels(reader, _maxDefinitionLevel, cd.definitions, cd.definitionsOffset, dataHeader.Num_values);
                        }

                        // The value reader is given the page's declared value count as its read limit.
                        ReadColumn(reader, dataHeader.Encoding, maxValues, dataHeader.Num_values, cd);
                    }
                }
            }
        }