/// <summary>
        /// Reads a single data page into <paramref name="cd"/>: repetition levels (only when the
        /// column's maximum repetition level is above zero), definition levels (only when the
        /// maximum definition level is above zero) and then the encoded values.
        /// </summary>
        /// <param name="ph">Header of the page to read; its data page header supplies the value encoding.</param>
        /// <param name="cd">Accumulator whose level buffers, value buffers and offsets are advanced by this call.</param>
        /// <param name="maxValues">Value limit forwarded unchanged to <c>ReadColumn</c>.</param>
        private void ReadDataPage(Thrift.PageHeader ph, ColumnRawData cd, long maxValues)
        {
            using (Stream dataStream = OpenDataPageStream(ph))
            {
                ParquetEventSource.Current.OpenDataPage(
                    _dataField.Path,
                    _thriftColumnChunk.Meta_data.Codec.ToString(),
                    dataStream.Length);

                using (var pageReader = new BinaryReader(dataStream))
                {
                    if (_maxRepetitionLevel > 0)
                    {
                        // TODO: switch to rented buffers. A rented array may be longer than requested,
                        // so any downstream logic relying on the array length has to be fixed first.
                        cd.repetitions = cd.repetitions ?? new int[cd.maxCount];

                        int repetitionsRead = ReadLevels(pageReader, _maxRepetitionLevel, cd.repetitions, cd.repetitionsOffset);
                        cd.repetitionsOffset += repetitionsRead;
                    }

                    if (_maxDefinitionLevel > 0)
                    {
                        // Allocated lazily on the first page and reused for every later page.
                        cd.definitions = cd.definitions ?? new int[cd.maxCount];

                        int definitionsRead = ReadLevels(pageReader, _maxDefinitionLevel, cd.definitions, cd.definitionsOffset);
                        cd.definitionsOffset += definitionsRead;
                    }

                    // NOTE(review): ph.Data_page_header is dereferenced here without a null check.
                    ReadColumn(
                        pageReader, ph.Data_page_header.Encoding, maxValues,
                        ref cd.values, ref cd.valuesOffset,
                        ref cd.indexes, ref cd.indexesOffset);
                }
            }
        }
        /// <summary>
        /// Reads the whole column chunk: seeks to its start, consumes the optional dictionary page,
        /// then reads data pages until the expected number of values has been collected or a
        /// non-data page header is encountered.
        /// </summary>
        /// <returns>
        /// A <see cref="DataColumn"/> built from the accumulated values, definition levels,
        /// repetition levels, dictionary and dictionary indexes.
        /// </returns>
        public DataColumn Read()
        {
            long fileOffset = GetFileOffset();
            long maxValues  = _thriftColumnChunk.Meta_data.Num_values;

            _inputStream.Seek(fileOffset, SeekOrigin.Begin);

            ParquetEventSource.Current.SeekColumn(_dataField.Path, fileOffset);

            var colData = new ColumnRawData();

            // Level buffers are allocated with this size by ReadDataPage.
            colData.maxCount = (int)maxValues;

            // There can be only one dictionary page in a column; when present it precedes the data pages.
            Thrift.PageHeader ph = _thriftStream.Read<Thrift.PageHeader>();
            if (TryReadDictionaryPage(ph, out colData.dictionary, out colData.dictionaryOffset))
            {
                ph = _thriftStream.Read<Thrift.PageHeader>();
            }

            while (true)
            {
                int valuesSoFar = Math.Max(
                    colData.indexes == null ? 0 : colData.indexesOffset,
                    colData.values == null ? 0 : colData.values.Length);
                ReadDataPage(ph, colData, maxValues - valuesSoFar);

                // Progress on definition levels must be measured with definitionsOffset, not with
                // definitions.Length: the buffer is allocated up front with maxCount elements, so its
                // length already equals maxValues after the first page and the loop would stop before
                // the remaining pages of the chunk were read.
                // NOTE(review): values.Length may have the same problem if ReadColumn preallocates
                // the values array - confirm against ReadColumn for this version.
                int totalCount = Math.Max(
                    (colData.values == null ? 0 : colData.values.Length) +
                    (colData.indexes == null ? 0 : colData.indexesOffset),
                    (colData.definitions == null ? 0 : colData.definitionsOffset));
                if (totalCount >= maxValues)
                {
                    break;                      // limit reached
                }

                ph = _thriftStream.Read<Thrift.PageHeader>();
                if (ph.Type != Thrift.PageType.DATA_PAGE)
                {
                    break;                      // anything other than a data page ends the loop
                }
            }

            // All the data is available here.
            return new DataColumn(
                _dataField, colData.values,
                colData.definitions, _maxDefinitionLevel,
                colData.repetitions, _maxRepetitionLevel,
                colData.dictionary,
                colData.indexes);
        }
예제 #3
0
        /// <summary>
        /// Reads a single data page into <paramref name="cd"/>: repetition levels (only when the
        /// column's maximum repetition level is above zero), definition levels (only when the
        /// maximum definition level is above zero) and then the encoded values.
        /// </summary>
        /// <param name="ph">Header of the page to read; must carry a data page header.</param>
        /// <param name="cd">Accumulator whose level buffers and offsets are advanced by this call.</param>
        /// <param name="maxValues">Value limit forwarded unchanged to <c>ReadColumn</c>.</param>
        /// <exception cref="ParquetException">The page header has no data page header.</exception>
        private void ReadDataPage(Thrift.PageHeader ph, ColumnRawData cd, long maxValues)
        {
            // Validate before anything dereferences the header. Both level readers below need
            // Num_values, so a check placed after them can only ever surface as a
            // NullReferenceException for columns that have levels.
            if (ph.Data_page_header == null)
            {
                throw new ParquetException($"column '{_dataField.Path}' is missing data page header, file is corrupt");
            }

            var dataHeader = ph.Data_page_header;
            int pageValueCount = dataHeader.Num_values;

            using (BytesOwner bytes = ReadPageData(ph))
            {
                //todo: this is ugly, but will be removed once other parts are migrated to System.Memory
                using (var ms = bytes.ToStream())
                {
                    ParquetEventSource.Current.OpenDataPage(_dataField.Path, _thriftColumnChunk.Meta_data.Codec.ToString(), ms.Length);

                    using (var reader = new BinaryReader(ms))
                    {
                        if (_maxRepetitionLevel > 0)
                        {
                            //todo: use rented buffers, but be aware that rented length can be more than requested so underlying logic relying on array length must be fixed too.
                            if (cd.repetitions == null)
                            {
                                cd.repetitions = new int[cd.maxCount];
                            }

                            cd.repetitionsOffset += ReadLevels(reader, _maxRepetitionLevel, cd.repetitions, cd.repetitionsOffset, pageValueCount);
                        }

                        if (_maxDefinitionLevel > 0)
                        {
                            if (cd.definitions == null)
                            {
                                cd.definitions = new int[cd.maxCount];
                            }

                            cd.definitionsOffset += ReadLevels(reader, _maxDefinitionLevel, cd.definitions, cd.definitionsOffset, pageValueCount);
                        }

                        // If statistics are defined, use the null count to determine the exact number of
                        // items to read. It is not known whether every parquet file with null values has
                        // statistics; counting defined values from the definition levels may be a better
                        // solution.
                        int maxReadCount = pageValueCount - (int)(dataHeader.Statistics?.Null_count ?? 0);
                        ReadColumn(reader, dataHeader.Encoding, maxValues, maxReadCount, cd);
                    }
                }
            }
        }
예제 #4
0
        /// <summary>
        /// Reads a single data page into <paramref name="cd"/>: repetition levels (only when the
        /// column's maximum repetition level is above zero), definition levels (only when the
        /// maximum definition level is above zero) and then the encoded values.
        /// </summary>
        /// <param name="ph">Header of the page to read; must carry a data page header.</param>
        /// <param name="cd">Accumulator whose level buffers and offsets are advanced by this call.</param>
        /// <param name="maxValues">Value limit forwarded unchanged to <c>ReadColumn</c>.</param>
        /// <exception cref="ParquetException">The page header has no data page header.</exception>
        private void ReadDataPage(Thrift.PageHeader ph, ColumnRawData cd, long maxValues)
        {
            // Validate before anything dereferences the header. Both level readers below need
            // Num_values, so a check placed after them can only ever surface as a
            // NullReferenceException for columns that have levels.
            if (ph.Data_page_header == null)
            {
                throw new ParquetException($"column '{_dataField.Path}' is missing data page header, file is corrupt");
            }

            var dataHeader = ph.Data_page_header;
            int pageValueCount = dataHeader.Num_values;

            using (BytesOwner bytes = ReadPageData(ph))
            {
                //todo: this is ugly, but will be removed once other parts are migrated to System.Memory
                using (var ms = bytes.ToStream())
                {
                    ParquetEventSource.Current.OpenDataPage(_dataField.Path, _thriftColumnChunk.Meta_data.Codec.ToString(), ms.Length);

                    using (var reader = new BinaryReader(ms))
                    {
                        if (_maxRepetitionLevel > 0)
                        {
                            //todo: use rented buffers, but be aware that rented length can be more than requested so underlying logic relying on array length must be fixed too.
                            if (cd.repetitions == null)
                            {
                                cd.repetitions = new int[cd.maxCount];
                            }

                            cd.repetitionsOffset += ReadLevels(reader, _maxRepetitionLevel, cd.repetitions, cd.repetitionsOffset, pageValueCount);
                        }

                        if (_maxDefinitionLevel > 0)
                        {
                            if (cd.definitions == null)
                            {
                                cd.definitions = new int[cd.maxCount];
                            }

                            cd.definitionsOffset += ReadLevels(reader, _maxDefinitionLevel, cd.definitions, cd.definitionsOffset, pageValueCount);
                        }

                        // The page's declared value count is passed as the read limit.
                        ReadColumn(reader, dataHeader.Encoding, maxValues, pageValueCount, cd);
                    }
                }
            }
        }
예제 #5
0
        /// <summary>
        /// Reads the whole column chunk: seeks to its start, consumes the optional dictionary page,
        /// then reads data pages until the expected number of values has been collected or a
        /// non-data page header is encountered.
        /// </summary>
        /// <returns>
        /// A <see cref="DataColumn"/> built from the accumulated values, levels, dictionary and
        /// indexes. Its <c>Statistics</c> are populated when the column chunk metadata carries statistics.
        /// </returns>
        public DataColumn Read()
        {
            long fileOffset = GetFileOffset();
            long maxValues  = _thriftColumnChunk.Meta_data.Num_values;

            _inputStream.Seek(fileOffset, SeekOrigin.Begin);

            ParquetEventSource.Current.SeekColumn(_dataField.Path, fileOffset);

            var colData = new ColumnRawData();

            // Level buffers are allocated with this size by ReadDataPage.
            colData.maxCount = (int)maxValues;

            // There can be only one dictionary page in a column; when present it precedes the data pages.
            Thrift.PageHeader ph = _thriftStream.Read<Thrift.PageHeader>();
            if (TryReadDictionaryPage(ph, out colData.dictionary, out colData.dictionaryOffset))
            {
                ph = _thriftStream.Read<Thrift.PageHeader>();
            }

            while (true)
            {
                ReadDataPage(ph, colData, maxValues);

                // Stop once either the values or the definition levels have reached the expected
                // count; each counter is ignored while its buffer has not been allocated.
                int totalCount = Math.Max(
                    (colData.values == null ? 0 : colData.valuesOffset),
                    (colData.definitions == null ? 0 : colData.definitionsOffset));
                if (totalCount >= maxValues)
                {
                    break;                      // limit reached
                }

                ph = _thriftStream.Read<Thrift.PageHeader>();
                if (ph.Type != Thrift.PageType.DATA_PAGE)
                {
                    break;                      // anything other than a data page ends the loop
                }
            }

            // All the data is available here.
            var finalColumn = new DataColumn(
                _dataField, colData.values,
                colData.definitions, _maxDefinitionLevel,
                colData.repetitions, _maxRepetitionLevel,
                colData.dictionary,
                colData.indexes);

            var chunkStatistics = _thriftColumnChunk.Meta_data.Statistics;
            if (chunkStatistics != null)
            {
                // Min/max are stored as raw bytes and are decoded with the column's type handler.
                finalColumn.Statistics = new DataColumnStatistics(
                    chunkStatistics.Null_count,
                    chunkStatistics.Distinct_count,
                    _dataTypeHandler.PlainDecode(_thriftSchemaElement, chunkStatistics.Min_value),
                    _dataTypeHandler.PlainDecode(_thriftSchemaElement, chunkStatistics.Max_value));
            }

            return finalColumn;
        }
예제 #6
0
        /// <summary>
        /// Decodes the value section of a data page into <paramref name="cd"/> according to
        /// <paramref name="encoding"/>.
        /// </summary>
        /// <param name="reader">Reader positioned at the start of the encoded values.</param>
        /// <param name="encoding">Page value encoding; PLAIN, RLE and PLAIN_DICTIONARY are handled.</param>
        /// <param name="totalValues">Size used when the values and indexes buffers are first allocated.</param>
        /// <param name="maxReadCount">Upper bound passed to the index readers; not used for PLAIN.</param>
        /// <param name="cd">Accumulator receiving the decoded values; its <c>valuesOffset</c> is advanced.</param>
        /// <exception cref="ParquetException">The encoding is none of the handled ones.</exception>
        private void ReadColumn(BinaryReader reader, Thrift.Encoding encoding, long totalValues, int maxReadCount, ColumnRawData cd)
        {
            // The values buffer is created on the first page, before the encoding is examined.
            if (cd.values == null)
            {
                cd.values = _dataTypeHandler.GetArray((int)totalValues, false, false);
            }

            if (encoding == Thrift.Encoding.PLAIN)
            {
                int plainCount = _dataTypeHandler.Read(reader, _thriftSchemaElement, cd.values, cd.valuesOffset);
                cd.valuesOffset += plainCount;
                return;
            }

            bool isRle = encoding == Thrift.Encoding.RLE;
            if (!isRle && encoding != Thrift.Encoding.PLAIN_DICTIONARY)
            {
                throw new ParquetException($"encoding {encoding} is not supported.");
            }

            // Both remaining encodings produce dictionary indexes: they are read into the start of
            // cd.indexes and then merged with the dictionary into cd.values at the current offset.
            if (cd.indexes == null)
            {
                cd.indexes = new int[(int)totalValues];
            }

            int decodedCount;
            if (isRle)
            {
                decodedCount = RunLengthBitPackingHybridValuesReader.Read(reader, _thriftSchemaElement.Type_length, cd.indexes, 0, maxReadCount);
            }
            else
            {
                decodedCount = ReadPlainDictionary(reader, maxReadCount, cd.indexes, 0);
            }

            _dataTypeHandler.MergeDictionary(cd.dictionary, cd.indexes, cd.values, cd.valuesOffset, decodedCount);
            cd.valuesOffset += decodedCount;
        }