示例#1
0
        // Serializes a single data column into one data page: levels first, then values,
        // buffered through a compressing stream so header sizes can be back-filled.
        // Returns the page tags needed later for chunk size accounting.
        private List <PageTag> WriteColumn(DataColumn column,
                                           Thrift.SchemaElement tse,
                                           IDataTypeHandler dataTypeHandler,
                                           int maxRepetitionLevel,
                                           int maxDefinitionLevel)
        {
            var pages = new List <PageTag>();

            /*
             * Page header must precede actual data (compressed or not), however it contains both
             * the uncompressed and compressed data size which we don't know until the data has
             * been written! The page is therefore buffered in memory first, which somehow limits
             * the write efficiency.
             */


            using (var ms = new MemoryStream())
            {
                Thrift.PageHeader dataPageHeader = _footer.CreateDataPage(column.TotalCount);

                //chain streams together so we have real streaming instead of wasting undefraggable LOH memory
                using (GapStream pageStream = DataStreamFactory.CreateWriter(ms, _compressionMethod, true))
                {
                    using (var writer = new BinaryWriter(pageStream, Encoding.UTF8, true))
                    {
                        // repetition levels come first and only exist for repeated (nested) columns
                        if (maxRepetitionLevel > 0)
                        {
                            WriteLevels(writer, column.RepetitionLevels, maxRepetitionLevel);
                        }

                        // definition levels only exist when the column can contain nulls
                        if (maxDefinitionLevel > 0)
                        {
                            WriteLevels(writer, column.DefinitionLevels, maxDefinitionLevel);
                        }

                        dataTypeHandler.Write(tse, writer, column.DefinedData);

                        writer.Flush();
                    }

                    pageStream.Flush(); //extremely important to flush the stream as some compression algorithms don't finish writing
                    // position of the pre-compression stream is the uncompressed byte count
                    dataPageHeader.Uncompressed_page_size = (int)pageStream.Position;
                }
                // ms received the compressed output, so its position is the compressed byte count
                dataPageHeader.Compressed_page_size = (int)ms.Position;

                //write the header in, followed by the buffered page body
                int headerSize = _thriftStream.Write(dataPageHeader);
                ms.Position = 0;
                ms.CopyTo(_stream);

                var dataTag = new PageTag
                {
                    HeaderMeta = dataPageHeader,
                    HeaderSize = headerSize
                };

                pages.Add(dataTag);
            }

            return(pages);
        }
        // Decodes one data page into levels, values and/or dictionary indexes.
        private PageData ReadDataPage(Thrift.PageHeader ph, long maxValues)
        {
            // value count declared by the page header; bounds the level reads
            int levelCount = ph.Data_page_header.Num_values;

            var result = new PageData();

            using (Stream pageStream = OpenDataPageStream(ph))
            using (var reader = new BinaryReader(pageStream))
            {
                // repetition levels exist only for repeated (nested) columns
                if (_maxRepetitionLevel > 0)
                {
                    result.repetitions = ReadLevels(reader, _maxRepetitionLevel, levelCount);
                }

                // definition levels exist only when the column can contain nulls
                if (_maxDefinitionLevel > 0)
                {
                    result.definitions = ReadLevels(reader, _maxDefinitionLevel, levelCount);
                }

                ReadColumn(reader, ph.Data_page_header.Encoding, maxValues,
                           out result.values,
                           out result.indexes);
            }

            return result;
        }
示例#3
0
        // Compresses page data with the requested codec and records both sizes
        // in the page header. Returns the original array when no codec is used.
        private byte[] Compress(Thrift.PageHeader ph, byte[] data, CompressionMethod compression)
        {
            //note that page size numbers do not include header size by spec
            ph.Uncompressed_page_size = data.Length;

            if (compression == CompressionMethod.None)
            {
                // pass-through: compressed size equals uncompressed size
                ph.Compressed_page_size = ph.Uncompressed_page_size;
                return data;
            }

            IDataWriter compressor = DataFactory.GetWriter(compression);
            byte[] compressed;
            using (var buffer = new MemoryStream())
            {
                compressor.Write(data, buffer);
                compressed = buffer.ToArray();
            }
            ph.Compressed_page_size = compressed.Length;

            return compressed;
        }
示例#4
0
        // Reads one data page, decoding optional level streams and dictionary indexes.
        private PageData ReadDataPage(Thrift.PageHeader ph, IList destination, long maxValues)
        {
            byte[] raw = ReadRawBytes(ph, _inputStream);
            int numValues = ph.Data_page_header.Num_values;

            using (var dataStream = new MemoryStream(raw))
            using (var reader = new BinaryReader(dataStream))
            {
                List<int> repetitions = null;
                if (_schema.HasRepetitionLevelsPage)
                {
                    repetitions = ReadRepetitionLevels(reader);
                }

                List<int> definitions = null;
                if (_schema.HasDefinitionLevelsPage)
                {
                    definitions = ReadDefinitionLevels(reader);
                }

                // these are pointers back to the Values table - lookup on values
                List<int> indexes = ReadColumnValues(reader, ph.Data_page_header.Encoding, destination, numValues);

                return new PageData
                {
                    definitions = definitions,
                    repetitions = repetitions,
                    indexes = indexes
                };
            }
        }
示例#5
0
        // Reads all pages of this column chunk, merges levels/indexes/dictionary into
        // final values and returns the [offset, offset+count) window of them.
        public IList Read(long offset, long count)
        {
            // absolute position of the first page of this column chunk in the file
            long fileOffset = GetFileOffset();
            long maxValues  = _thriftColumnChunk.Meta_data.Num_values;

            _inputStream.Seek(fileOffset, SeekOrigin.Begin);

            IList      dictionary  = null;
            List <int> indexes     = null;
            List <int> repetitions = null;
            List <int> definitions = null;
            IList      values      = null;

            //there can be only one dictionary page in column
            Thrift.PageHeader ph = _thriftStream.Read <Thrift.PageHeader>();
            if (TryReadDictionaryPage(ph, out dictionary))
            {
                // dictionary consumed - advance to the first data page header
                ph = _thriftStream.Read <Thrift.PageHeader>();
            }

            int pagesRead = 0;

            // accumulate page contents until the declared value count is reached
            while (true)
            {
                // values may arrive plain or as dictionary indexes; either counts as progress
                int      valuesSoFar = Math.Max(indexes == null ? 0 : indexes.Count, values == null ? 0 : values.Count);
                PageData pd          = ReadDataPage(ph, maxValues - valuesSoFar);

                repetitions = AssignOrAdd(repetitions, pd.repetitions);
                definitions = AssignOrAdd(definitions, pd.definitions);
                indexes     = AssignOrAdd(indexes, pd.indexes);
                values      = AssignOrAdd(values, pd.values);

                pagesRead++;

                // definition levels also count nulls, hence the Max with values + indexes
                int totalCount = Math.Max(
                    (values == null ? 0 : values.Count) +
                    (indexes == null ? 0 : indexes.Count),
                    (definitions == null ? 0 : definitions.Count));
                if (totalCount >= maxValues)
                {
                    break;                      //limit reached
                }
                ph = _thriftStream.Read <Thrift.PageHeader>();
                if (ph.Type != Thrift.PageType.DATA_PAGE)
                {
                    break;
                }
            }

            // merge dictionary, levels and indexes back into a flat value list
            IList mergedValues = new ValueMerger(
                _maxDefinitionLevel,
                _maxRepetitionLevel,
                () => _dataTypeHandler.CreateEmptyList(_thriftSchemaElement.IsNullable(), false, 0),
                values ?? _dataTypeHandler.CreateEmptyList(_thriftSchemaElement.IsNullable(), false, 0))
                                 .Apply(dictionary, definitions, repetitions, indexes, (int)maxValues);

            // keep only the requested window
            mergedValues.Trim((int)offset, (int)count);

            return(mergedValues);
        }
示例#6
0
        // Writes a page header followed by its (already encoded) page body;
        // returns the serialized header size for chunk size accounting.
        private int Write(Thrift.PageHeader ph, byte[] data)
        {
            int headerSize = _thriftStream.Write(ph);
            _output.Write(data, 0, data.Length);
            return headerSize;
        }
示例#7
0
        // Attempts to read a dictionary page; returns false (leaving outputs empty)
        // when the current page is not a dictionary page.
        private bool TryReadDictionaryPage(Thrift.PageHeader ph, out Array dictionary, out int dictionaryOffset)
        {
            dictionary = null;
            dictionaryOffset = 0;

            if (ph.Type != Thrift.PageType.DICTIONARY_PAGE)
            {
                return false;
            }

            //Dictionary page format: the entries in the dictionary - in dictionary order - using the plain encoding.
            using (BytesOwner bytes = ReadPageData(ph))
            //todo: this is ugly, but will be removed once other parts are migrated to System.Memory
            using (var ms = new MemoryStream(bytes.Memory.ToArray()))
            using (var dataReader = new BinaryReader(ms))
            {
                dictionary = _dataTypeHandler.GetArray(ph.Dictionary_page_header.Num_values, false, false);
                dictionaryOffset = _dataTypeHandler.Read(dataReader, _thriftSchemaElement, dictionary, 0);
                return true;
            }
        }
示例#8
0
        // Decodes one data page (raw-bytes variant) into levels and values/indexes.
        PageData ReadDataPage(Thrift.PageHeader ph, long maxValues)
        {
            byte[] raw = ReadRawBytes(ph, _inputStream);
            // declared value count bounds the level decoding
            int levelCount = ph.Data_page_header.Num_values;

            var result = new PageData();

            using (var dataStream = new MemoryStream(raw))
            using (var reader = new BinaryReader(dataStream))
            {
                // repetition levels exist only for repeated (nested) columns
                if (_maxRepetitionLevel > 0)
                {
                    result.repetitions = ReadLevels(reader, _maxRepetitionLevel, levelCount);
                }

                // definition levels exist only when the column can contain nulls
                if (_maxDefinitionLevel > 0)
                {
                    result.definitions = ReadLevels(reader, _maxDefinitionLevel, levelCount);
                }

                ReadColumn(reader, ph.Data_page_header.Encoding, maxValues,
                           out result.values,
                           out result.indexes);
            }

            return result;
        }
示例#9
0
        // Decodes one data page using an explicit type handler and schema element;
        // level depths are resolved from the column chunk via the footer.
        private PageData ReadDataPage(IDataTypeHandler dataTypeHandler, Thrift.PageHeader ph, Thrift.SchemaElement tse, long maxValues)
        {
            byte[] raw = ReadRawBytes(ph, _inputStream);
            // NOTE(review): Num_values is read but not used below — looks intentional here
            int declaredCount = ph.Data_page_header.Num_values;

            _footer.GetLevels(_thriftColumnChunk, out int maxRepetitionLevel, out int maxDefinitionLevel);
            var result = new PageData();

            using (var dataStream = new MemoryStream(raw))
            using (var reader = new BinaryReader(dataStream))
            {
                // repetition levels exist only for repeated (nested) columns
                if (maxRepetitionLevel > 0)
                {
                    result.repetitions = ReadLevels(reader, maxRepetitionLevel);
                }

                // definition levels exist only when the column can contain nulls
                if (maxDefinitionLevel > 0)
                {
                    result.definitions = ReadLevels(reader, maxDefinitionLevel);
                }

                ReadColumn(dataTypeHandler, tse, reader, ph.Data_page_header.Encoding, maxValues,
                           out result.values,
                           out result.indexes);
            }

            return result;
        }
        // Reads one data page directly into the shared ColumnRawData accumulator,
        // advancing its level/value offsets in place (multi-page columns append).
        private void ReadDataPage(Thrift.PageHeader ph, ColumnRawData cd, long maxValues)
        {
            using (Stream pageStream = OpenDataPageStream(ph))
            {
                ParquetEventSource.Current.OpenDataPage(_dataField.Path, _thriftColumnChunk.Meta_data.Codec.ToString(), pageStream.Length);

                using (var reader = new BinaryReader(pageStream))
                {
                    // repetition levels exist only for repeated (nested) columns
                    if (_maxRepetitionLevel > 0)
                    {
                        //todo: use rented buffers, but be aware that rented length can be more than requested so underlying logic relying on array length must be fixed too.
                        if (cd.repetitions == null)
                        {
                            cd.repetitions = new int[cd.maxCount];
                        }

                        // ReadLevels returns how many levels were read; advance the shared offset
                        cd.repetitionsOffset += ReadLevels(reader, _maxRepetitionLevel, cd.repetitions, cd.repetitionsOffset);
                    }

                    // definition levels exist only when the column can contain nulls
                    if (_maxDefinitionLevel > 0)
                    {
                        if (cd.definitions == null)
                        {
                            cd.definitions = new int[cd.maxCount];
                        }

                        cd.definitionsOffset += ReadLevels(reader, _maxDefinitionLevel, cd.definitions, cd.definitionsOffset);
                    }

                    // values/indexes are appended at the current offsets, passed by ref
                    ReadColumn(reader, ph.Data_page_header.Encoding, maxValues,
                               ref cd.values, ref cd.valuesOffset,
                               ref cd.indexes, ref cd.indexesOffset);
                }
            }
        }
示例#11
0
        // Reads and decompresses the page body; the codec is defined at the
        // column-chunk level, not per page.
        private byte[] ReadRawBytes(Thrift.PageHeader ph, Stream inputStream)
        {
            Thrift.CompressionCodec codec = _thriftChunk.Meta_data.Codec;
            return DataFactory.GetReader(codec).Read(inputStream, ph.Compressed_page_size);
        }
        // Wraps the input stream in a window limited to this page's compressed size,
        // then layers the decompressing reader on top. Caller disposes the result.
        private Stream OpenDataPageStream(Thrift.PageHeader pageHeader)
        {
            var window = new WindowedStream(_inputStream, pageHeader.Compressed_page_size);
            return DataStreamFactory.CreateReader(window, _thriftColumnChunk.Meta_data.Codec, pageHeader.Uncompressed_page_size);
        }
        // Serializes a single data column into one buffered data page. Repeated
        // (nested) columns are not supported by this writer variant.
        private List <PageTag> WriteColumn(DataColumn column,
                                           Thrift.SchemaElement tse,
                                           IDataTypeHandler dataTypeHandler,
                                           int maxRepetitionLevel,
                                           int maxDefinitionLevel)
        {
            var pages = new List <PageTag>();

            /*
             * Page header must precede actual data (compressed or not), however it contains both
             * the uncompressed and compressed data size which we don't know until the data has
             * been written! The page is therefore buffered in memory first, which somehow limits
             * the write efficiency.
             */


            using (var ms = new MemoryStream())
            {
                Thrift.PageHeader dataPageHeader = _footer.CreateDataPage(column.TotalCount);

                //chain streams together so we have real streaming instead of wasting undefraggable LOH memory
                using (PositionTrackingStream pps = DataStreamFactory.CreateWriter(ms, _compressionMethod))
                {
                    using (var writer = new BinaryWriter(pps))
                    {
                        // repetition levels (nested columns) are not implemented here
                        if (column.HasRepetitions)
                        {
                            throw new NotImplementedException();
                        }

                        if (column.HasDefinitions)
                        {
                            WriteLevels(writer, column.DefinitionLevels, maxDefinitionLevel);
                        }

                        dataTypeHandler.Write(tse, writer, column.DefinedData);
                    }

                    // position of the pre-compression stream is the uncompressed byte count
                    dataPageHeader.Uncompressed_page_size = (int)pps.Position;
                }
                // ms received the compressed output, so its position is the compressed byte count
                dataPageHeader.Compressed_page_size = (int)ms.Position;

                //write the header in, followed by the buffered page body
                int headerSize = _thriftStream.Write(dataPageHeader);
                ms.Position = 0;
                ms.CopyTo(_stream);

                var dataTag = new PageTag
                {
                    HeaderMeta = dataPageHeader,
                    HeaderSize = headerSize
                };

                pages.Add(dataTag);
            }

            return(pages);
        }
示例#14
0
        // Reads all pages of the column chunk, accumulating levels/values, but does
        // not yet merge or return them (merging is still commented out below).
        public void Read(long offset, long count)
        {
            Thrift.SchemaElement tse = _footer.GetSchemaElement(_thriftColumnChunk);

            // resolve the type handler for this column from its schema element
            IDataTypeHandler dataTypeHandler = DataTypeFactory.Match(tse, _parquetOptions);

            long fileOffset = GetFileOffset();
            long maxValues  = _thriftColumnChunk.Meta_data.Num_values;

            _inputStream.Seek(fileOffset, SeekOrigin.Begin);

            IList      dictionary  = null;
            List <int> indexes     = null;
            List <int> repetitions = null;
            List <int> definitions = null;
            IList      values      = null;

            //there can be only one dictionary page in column
            Thrift.PageHeader ph = _thriftStream.Read <Thrift.PageHeader>();
            if (TryReadDictionaryPage(ph, dataTypeHandler, out dictionary))
            {
                // dictionary consumed - advance to the first data page header
                ph = _thriftStream.Read <Thrift.PageHeader>();
            }

            int pagesRead = 0;

            // accumulate page contents until the declared value count is reached
            while (true)
            {
                // values may arrive plain or as dictionary indexes; either counts as progress
                int      valuesSoFar = Math.Max(indexes == null ? 0 : indexes.Count, values == null ? 0 : values.Count);
                PageData pd          = ReadDataPage(dataTypeHandler, ph, tse, maxValues - valuesSoFar);

                repetitions = AssignOrAdd(repetitions, pd.repetitions);
                definitions = AssignOrAdd(definitions, pd.definitions);
                indexes     = AssignOrAdd(indexes, pd.indexes);
                values      = AssignOrAdd(values, pd.values);

                pagesRead++;

                // definition levels also count nulls, hence the Max with values + indexes
                int totalCount = Math.Max(
                    (values == null ? 0 : values.Count) +
                    (indexes == null ? 0 : indexes.Count),
                    (definitions == null ? 0 : definitions.Count));
                if (totalCount >= maxValues)
                {
                    break;                      //limit reached
                }
                ph = _thriftStream.Read <Thrift.PageHeader>();
                if (ph.Type != Thrift.PageType.DATA_PAGE)
                {
                    break;
                }
            }

            //IList mergedValues = new ValueMerger(_schema, values)
            //   .Apply(dictionary, definitions, repetitions, indexes, (int)maxValues);
        }
示例#15
0
        // Returns false for non-dictionary pages; dictionary decoding itself is
        // not implemented in this reader variant yet.
        private bool TryReadDictionaryPage(Thrift.PageHeader ph, IDataTypeHandler dataTypeHandler, out IList dictionary)
        {
            if (ph.Type == Thrift.PageType.DICTIONARY_PAGE)
            {
                throw new NotImplementedException();
            }

            dictionary = null;
            return false;
        }
        // Reads the whole column chunk into a DataColumn.
        // todo-noted below as a simple hack: dictionary/index resolution is skipped.
        public DataColumn Read()
        {
            long fileOffset = GetFileOffset();
            long maxValues  = _thriftColumnChunk.Meta_data.Num_values;

            _inputStream.Seek(fileOffset, SeekOrigin.Begin);

            IList      dictionary  = null;
            List <int> indexes     = null;
            List <int> repetitions = null;
            List <int> definitions = null;
            IList      values      = null;

            //there can be only one dictionary page in column
            Thrift.PageHeader ph = _thriftStream.Read <Thrift.PageHeader>();
            if (TryReadDictionaryPage(ph, out dictionary))
            {
                // dictionary consumed - advance to the first data page header
                ph = _thriftStream.Read <Thrift.PageHeader>();
            }

            int pagesRead = 0;

            // accumulate page contents until the declared value count is reached
            while (true)
            {
                // values may arrive plain or as dictionary indexes; either counts as progress
                int      valuesSoFar = Math.Max(indexes == null ? 0 : indexes.Count, values == null ? 0 : values.Count);
                PageData pd          = ReadDataPage(ph, maxValues - valuesSoFar);

                repetitions = AssignOrAdd(repetitions, pd.repetitions);
                definitions = AssignOrAdd(definitions, pd.definitions);
                indexes     = AssignOrAdd(indexes, pd.indexes);
                values      = AssignOrAdd(values, pd.values);

                pagesRead++;

                // definition levels also count nulls, hence the Max with values + indexes
                int totalCount = Math.Max(
                    (values == null ? 0 : values.Count) +
                    (indexes == null ? 0 : indexes.Count),
                    (definitions == null ? 0 : definitions.Count));
                if (totalCount >= maxValues)
                {
                    break;                      //limit reached
                }
                ph = _thriftStream.Read <Thrift.PageHeader>();
                if (ph.Type != Thrift.PageType.DATA_PAGE)
                {
                    break;
                }
            }

            // all the data is available here!

            // todo: this is a simple hack for trivial tests to succeed
            return(new DataColumn(_dataField, values, definitions, repetitions));
        }
        // Reads the whole column chunk into raw arrays (ColumnRawData) and assembles
        // the final DataColumn including dictionary and level metadata.
        public DataColumn Read()
        {
            long fileOffset = GetFileOffset();
            long maxValues  = _thriftColumnChunk.Meta_data.Num_values;

            _inputStream.Seek(fileOffset, SeekOrigin.Begin);

            ParquetEventSource.Current.SeekColumn(_dataField.Path, fileOffset);

            var colData = new ColumnRawData();

            // pre-sized upper bound for level/value arrays allocated per page
            colData.maxCount = (int)_thriftColumnChunk.Meta_data.Num_values;

            //there can be only one dictionary page in column
            Thrift.PageHeader ph = _thriftStream.Read <Thrift.PageHeader>();
            if (TryReadDictionaryPage(ph, out colData.dictionary, out colData.dictionaryOffset))
            {
                // dictionary consumed - advance to the first data page header
                ph = _thriftStream.Read <Thrift.PageHeader>();
            }

            int pagesRead = 0;

            // accumulate page contents until the declared value count is reached
            while (true)
            {
                // values may arrive plain or as dictionary indexes; either counts as progress
                int valuesSoFar = Math.Max(colData.indexes == null ? 0 : colData.indexesOffset, colData.values == null ? 0 : colData.values.Length);
                ReadDataPage(ph, colData, maxValues - valuesSoFar);

                pagesRead++;

                // definition levels also count nulls, hence the Max with values + indexes
                int totalCount = Math.Max(
                    (colData.values == null ? 0 : colData.values.Length) +
                    (colData.indexes == null ? 0 : colData.indexesOffset),
                    (colData.definitions == null ? 0 : colData.definitions.Length));
                if (totalCount >= maxValues)
                {
                    break;                      //limit reached
                }
                ph = _thriftStream.Read <Thrift.PageHeader>();
                if (ph.Type != Thrift.PageType.DATA_PAGE)
                {
                    break;
                }
            }

            // all the data is available here!

            return(new DataColumn(
                       _dataField, colData.values,
                       colData.definitions, _maxDefinitionLevel,
                       colData.repetitions, _maxRepetitionLevel,
                       colData.dictionary,
                       colData.indexes));
        }
示例#18
0
        // Builds a dictionary page header; entries use plain encoding,
        // sizes (the two zeros) are filled in later by the writer.
        public Thrift.PageHeader CreateDictionaryPage(int valueCount)
        {
            return new Thrift.PageHeader(Thrift.PageType.DICTIONARY_PAGE, 0, 0)
            {
                Dictionary_page_header = new Thrift.DictionaryPageHeader
                {
                    Encoding   = Thrift.Encoding.PLAIN,
                    Is_sorted  = false,
                    Num_values = valueCount
                }
            };
        }
示例#19
0
        // Builds a data page header with the fixed encodings this writer emits;
        // page sizes (the two zeros) are filled in later once data is serialized.
        public Thrift.PageHeader CreateDataPage(int valueCount)
        {
            return new Thrift.PageHeader(Thrift.PageType.DATA_PAGE, 0, 0)
            {
                Data_page_header = new Thrift.DataPageHeader
                {
                    Encoding = Thrift.Encoding.PLAIN,
                    Definition_level_encoding = Thrift.Encoding.RLE,
                    Repetition_level_encoding = Thrift.Encoding.BIT_PACKED,
                    Num_values = valueCount
                }
            };
        }
        // Writes a full column chunk (pages + metadata) and returns its thrift descriptor.
        private Thrift.ColumnChunk WriteColumnChunk(Thrift.SchemaElement tse, List <string> path, DataColumn column, IDataTypeHandler dataTypeHandler)
        {
            Thrift.ColumnChunk chunk = _footer.CreateColumnChunk(_compressionMethod, _stream, tse.Type, path, 0);
            // NOTE(review): this header only carries Num_values (_rowCount) into the chunk
            // metadata; the page header actually written comes from WriteColumn — confirm
            // _rowCount matches the column's value count.
            Thrift.PageHeader ph = _footer.CreateDataPage(_rowCount);
            _footer.GetLevels(chunk, out int maxRepetitionLevel, out int maxDefinitionLevel);

            List<PageTag> pageTags = WriteColumn(column, tse, dataTypeHandler, maxRepetitionLevel, maxDefinitionLevel);

            chunk.Meta_data.Num_values = ph.Data_page_header.Num_values;

            //the following counters must include both data size and header size
            chunk.Meta_data.Total_compressed_size = pageTags.Sum(t => t.HeaderMeta.Compressed_page_size + t.HeaderSize);
            chunk.Meta_data.Total_uncompressed_size = pageTags.Sum(t => t.HeaderMeta.Uncompressed_page_size + t.HeaderSize);

            return chunk;
        }
示例#21
0
        // Reads a dictionary page into a typed list.
        // Dictionary page format: the entries in the dictionary - in dictionary order - using the plain encoding.
        private IList ReadDictionaryPage(Thrift.PageHeader ph)
        {
            byte[] raw = ReadRawBytes(ph, _inputStream);

            using (var dataStream = new MemoryStream(raw))
            using (var dataReader = new BinaryReader(dataStream))
            {
                IList entries = TypeFactory.Create(_schema, _options);
                // read everything the page contains - the page body is the bound
                _plainReader.Read(dataReader, _schema, entries, int.MaxValue);
                return entries;
            }
        }
示例#22
0
        // Reads one data page into the shared ColumnRawData accumulator, advancing
        // its level/value offsets in place. Throws ParquetException for corrupt files
        // that lack a data page header.
        private void ReadDataPage(Thrift.PageHeader ph, ColumnRawData cd, long maxValues)
        {
            // Fix: this guard must run BEFORE any dereference of ph.Data_page_header.
            // The original checked it only after reading Num_values for the level
            // decoders, so a corrupt file crashed with NullReferenceException instead
            // of the intended ParquetException.
            if (ph.Data_page_header == null)
            {
                throw new ParquetException($"column '{_dataField.Path}' is missing data page header, file is corrupt");
            }

            using (BytesOwner bytes = ReadPageData(ph))
            {
                //todo: this is ugly, but will be removed once other parts are migrated to System.Memory
                using (var ms = bytes.ToStream())
                {
                    ParquetEventSource.Current.OpenDataPage(_dataField.Path, _thriftColumnChunk.Meta_data.Codec.ToString(), ms.Length);

                    using (var reader = new BinaryReader(ms))
                    {
                        // repetition levels exist only for repeated (nested) columns
                        if (_maxRepetitionLevel > 0)
                        {
                            //todo: use rented buffers, but be aware that rented length can be more than requested so underlying logic relying on array length must be fixed too.
                            if (cd.repetitions == null)
                            {
                                cd.repetitions = new int[cd.maxCount];
                            }

                            cd.repetitionsOffset += ReadLevels(reader, _maxRepetitionLevel, cd.repetitions, cd.repetitionsOffset, ph.Data_page_header.Num_values);
                        }

                        // definition levels exist only when the column can contain nulls
                        if (_maxDefinitionLevel > 0)
                        {
                            if (cd.definitions == null)
                            {
                                cd.definitions = new int[cd.maxCount];
                            }

                            cd.definitionsOffset += ReadLevels(reader, _maxDefinitionLevel, cd.definitions, cd.definitionsOffset, ph.Data_page_header.Num_values);
                        }

                        // if statistics are defined, use null count to determine the exact number of items we should read
                        // however, I don't know if all parquet files with null values have stats defined. Maybe a better solution would
                        // be using a count of defined values (from reading definitions?)
                        int maxReadCount = ph.Data_page_header.Num_values - (int)(ph.Data_page_header.Statistics?.Null_count ?? 0);
                        ReadColumn(reader, ph.Data_page_header.Encoding, maxValues, maxReadCount, cd);
                    }
                }
            }
        }
示例#23
0
        // Reads one data page, decoding optional levels and dictionary indexes,
        // then trims any decoder over-read for non-repeated columns.
        private PageData ReadDataPage(Thrift.PageHeader ph, IList destination, long maxValues)
        {
            byte[] raw = ReadRawBytes(ph, _inputStream);
            int numValues = ph.Data_page_header.Num_values;

            using (var dataStream = new MemoryStream(raw))
            using (var reader = new BinaryReader(dataStream))
            {
                List<int> repetitions = null;
                if (_schema.HasRepetitionLevelsPage)
                {
                    repetitions = ReadRepetitionLevels(reader);
                }

                List<int> definitions = null;
                if (_schema.HasDefinitionLevelsPage)
                {
                    definitions = ReadDefinitionLevels(reader);
                }

                // these are pointers back to the Values table - lookup on values
                List<int> indexes = ReadColumnValues(reader, ph.Data_page_header.Encoding, destination, numValues);

                //trim output if it exceeds max number of values
                if (!_schema.IsRepeated)
                {
                    if (repetitions != null)
                    {
                        ValueMerger.TrimTail(repetitions, numValues);
                    }

                    if (definitions != null)
                    {
                        ValueMerger.TrimTail(definitions, numValues);
                    }

                    if (indexes != null)
                    {
                        ValueMerger.TrimTail(indexes, numValues);
                    }
                }

                return new PageData
                {
                    definitions = definitions,
                    repetitions = repetitions,
                    indexes = indexes
                };
            }
        }
示例#24
0
        // Writes a column chunk for the given values and returns its descriptor.
        // NOTE(review): offset/count are currently unused here — confirm with callers.
        public Thrift.ColumnChunk Write(int offset, int count, IList values)
        {
            // null means an empty column: materialize an empty list of the right shape
            IList data = values ?? TypeFactory.Create(_schema.ElementType, _schema.IsNullable, _schema.IsRepeated);

            Thrift.ColumnChunk chunk = _meta.AddColumnChunk(_compressionMethod, _output, _schema, data.Count);
            Thrift.PageHeader ph = _meta.CreateDataPage(data.Count);

            List<PageTag> pageTags = WriteValues(_schema, data, ph, _compressionMethod);

            //the following counters must include both data size and header size
            chunk.Meta_data.Total_compressed_size = pageTags.Sum(t => t.HeaderMeta.Compressed_page_size + t.HeaderSize);
            chunk.Meta_data.Total_uncompressed_size = pageTags.Sum(t => t.HeaderMeta.Uncompressed_page_size + t.HeaderSize);

            return chunk;
        }
示例#25
0
        // Writes a full column chunk for a DataColumn and returns its descriptor.
        public Thrift.ColumnChunk Write(List <string> path, DataColumn column, IDataTypeHandler dataTypeHandler)
        {
            Thrift.ColumnChunk chunk = _footer.CreateColumnChunk(_compressionMethod, _stream, _schemaElement.Type, path, 0);
            Thrift.PageHeader ph = _footer.CreateDataPage(column.TotalCount);
            _footer.GetLevels(chunk, out int maxRepetitionLevel, out int maxDefinitionLevel);

            List<PageTag> pageTags = WriteColumn(column, _schemaElement, dataTypeHandler, maxRepetitionLevel, maxDefinitionLevel);

            //this count must be set to number of all values in the column, including nulls.
            //for hierarchy/repeated columns this is a count of flattened list, including nulls.
            chunk.Meta_data.Num_values = ph.Data_page_header.Num_values;

            //the following counters must include both data size and header size
            chunk.Meta_data.Total_compressed_size = pageTags.Sum(t => t.HeaderMeta.Compressed_page_size + t.HeaderSize);
            chunk.Meta_data.Total_uncompressed_size = pageTags.Sum(t => t.HeaderMeta.Uncompressed_page_size + t.HeaderSize);

            return chunk;
        }
        // Attempts to read a dictionary page; returns false (dictionary = null)
        // when the current page is not a dictionary page.
        private bool TryReadDictionaryPage(Thrift.PageHeader ph, out IList dictionary)
        {
            if (ph.Type != Thrift.PageType.DICTIONARY_PAGE)
            {
                dictionary = null;
                return false;
            }

            //Dictionary page format: the entries in the dictionary - in dictionary order - using the plain encoding.
            using (Stream pageStream = OpenDataPageStream(ph))
            using (var dataReader = new BinaryReader(pageStream))
            {
                dictionary = _dataTypeHandler.Read(_thriftSchemaElement, dataReader, _parquetOptions);
                return true;
            }
        }
示例#27
0
        /// <summary>
        /// Reads one data page into <paramref name="cd"/>: repetition levels, definition levels, then values.
        /// </summary>
        /// <param name="ph">Header of the page to read.</param>
        /// <param name="cd">Accumulator for decoded levels/values; level arrays are lazily allocated here.</param>
        /// <param name="maxValues">Upper bound on the number of values to decode.</param>
        /// <exception cref="ParquetException">Thrown when the page has no data page header (corrupt file).</exception>
        private void ReadDataPage(Thrift.PageHeader ph, ColumnRawData cd, long maxValues)
        {
            //BUGFIX: this guard used to run AFTER ph.Data_page_header.Num_values was dereferenced below,
            //so a corrupt file surfaced as NullReferenceException instead of ParquetException.
            //Validate before touching the header.
            if (ph.Data_page_header == null)
            {
                throw new ParquetException($"column '{_dataField.Path}' is missing data page header, file is corrupt");
            }

            using (BytesOwner bytes = ReadPageData(ph))
            {
                //todo: this is ugly, but will be removed once other parts are migrated to System.Memory
                using (var ms = bytes.ToStream())
                {
                    ParquetEventSource.Current.OpenDataPage(_dataField.Path, _thriftColumnChunk.Meta_data.Codec.ToString(), ms.Length);

                    using (var reader = new BinaryReader(ms))
                    {
                        if (_maxRepetitionLevel > 0)
                        {
                            //todo: use rented buffers, but be aware that rented length can be more than requested so underlying logic relying on array length must be fixed too.
                            if (cd.repetitions == null)
                            {
                                cd.repetitions = new int[cd.maxCount];
                            }

                            cd.repetitionsOffset += ReadLevels(reader, _maxRepetitionLevel, cd.repetitions, cd.repetitionsOffset, ph.Data_page_header.Num_values);
                        }

                        if (_maxDefinitionLevel > 0)
                        {
                            if (cd.definitions == null)
                            {
                                cd.definitions = new int[cd.maxCount];
                            }

                            cd.definitionsOffset += ReadLevels(reader, _maxDefinitionLevel, cd.definitions, cd.definitionsOffset, ph.Data_page_header.Num_values);
                        }

                        ReadColumn(reader, ph.Data_page_header.Encoding, maxValues, ph.Data_page_header.Num_values, cd);
                    }
                }
            }
        }
示例#28
0
        /// <summary>
        /// Attempts to decode a dictionary page from raw page bytes read off the input stream.
        /// </summary>
        /// <param name="ph">Page header to inspect.</param>
        /// <param name="dictionary">Decoded dictionary values, or null when the page is not a dictionary page.</param>
        /// <returns>True when a dictionary page was read.</returns>
        bool TryReadDictionaryPage(Thrift.PageHeader ph, out IList dictionary)
        {
            if (ph.Type != Thrift.PageType.DICTIONARY_PAGE)
            {
                //only dictionary pages carry a dictionary
                dictionary = null;
                return false;
            }

            //Dictionary page format: the entries in the dictionary - in dictionary order - using the plain encoding.

            byte[] rawData = ReadRawBytes(ph, _inputStream);

            //BinaryReader disposes the underlying MemoryStream when it is disposed
            using (var dataReader = new BinaryReader(new MemoryStream(rawData)))
            {
                dictionary = _dataTypeHandler.Read(_thriftSchemaElement, dataReader, _parquetOptions);
                return true;
            }
        }
        /// <summary>
        /// Creates a writer for a single column chunk.
        /// </summary>
        /// <param name="output">Destination stream for the chunk's data.</param>
        /// <param name="thriftStream">Thrift protocol wrapper used for serialising metadata.</param>
        /// <param name="footer">File footer helper used to create chunk/page metadata and resolve levels.</param>
        /// <param name="tse">Thrift schema element describing the column.</param>
        /// <param name="path">Schema path of the column.</param>
        /// <param name="compressionMethod">Compression to apply to page data.</param>
        /// <param name="formatOptions">Parquet format options, also used to pick the data type handler.</param>
        /// <param name="writerOptions">General writer options.</param>
        public ColumnarWriter(Stream output, ThriftStream thriftStream,
                              ThriftFooter footer,
                              Thrift.SchemaElement tse, List <string> path,
                              CompressionMethod compressionMethod,
                              ParquetOptions formatOptions,
                              WriterOptions writerOptions)
        {
            //capture collaborators and options
            _output            = output;
            _thriftStream      = thriftStream;
            _footer            = footer;
            _tse               = tse;
            _compressionMethod = compressionMethod;
            _formatOptions     = formatOptions;
            _writerOptions     = writerOptions;

            //resolve the type handler for this schema element up front
            _dataTypeHandler = DataTypeFactory.Match(tse, _formatOptions);

            //pre-create chunk and page metadata; page value count is filled in later
            _chunk = _footer.CreateColumnChunk(_compressionMethod, _output, _tse.Type, path, 0);
            _ph    = _footer.CreateDataPage(0);

            int maxRepetitionLevel;
            int maxDefinitionLevel;
            _footer.GetLevels(_chunk, out maxRepetitionLevel, out maxDefinitionLevel);
            _maxRepetitionLevel = maxRepetitionLevel;
            _maxDefinitionLevel = maxDefinitionLevel;
        }
        /// <summary>
        /// Attempts to read a dictionary page into a pre-sized array.
        /// </summary>
        /// <param name="ph">Page header to inspect.</param>
        /// <param name="dictionary">Array of decoded dictionary entries, or null when not a dictionary page.</param>
        /// <param name="dictionaryOffset">Number of entries actually read into <paramref name="dictionary"/>.</param>
        /// <returns>True when a dictionary page was read.</returns>
        private bool TryReadDictionaryPage(Thrift.PageHeader ph, out Array dictionary, out int dictionaryOffset)
        {
            if (ph.Type == Thrift.PageType.DICTIONARY_PAGE)
            {
                //Dictionary page format: the entries in the dictionary - in dictionary order - using the plain encoding.
                using (Stream pageStream = OpenDataPageStream(ph))
                using (var dataReader = new BinaryReader(pageStream))
                {
                    //NOTE(review): array is sized from the chunk's total value count - presumably an upper
                    //bound on dictionary size; actual entries read are reported via dictionaryOffset
                    int totalValues = (int)_thriftColumnChunk.Meta_data.Num_values;
                    dictionary = _dataTypeHandler.GetArray(totalValues, false, false);

                    dictionaryOffset = _dataTypeHandler.Read(dataReader, _thriftSchemaElement, dictionary, 0, _parquetOptions);

                    return true;
                }
            }

            //not a dictionary page
            dictionary       = null;
            dictionaryOffset = 0;
            return false;
        }