예제 #1
0
        private PageData ReadDataPage(IDataTypeHandler dataTypeHandler, Thrift.PageHeader ph, Thrift.SchemaElement tse, long maxValues)
        {
            byte[] data = ReadRawBytes(ph, _inputStream);
            int    max  = ph.Data_page_header.Num_values;

            _footer.GetLevels(_thriftColumnChunk, out int maxRepetitionLevel, out int maxDefinitionLevel);
            var pd = new PageData();

            using (var dataStream = new MemoryStream(data))
            {
                using (var reader = new BinaryReader(dataStream))
                {
                    if (maxRepetitionLevel > 0)
                    {
                        pd.repetitions = ReadLevels(reader, maxRepetitionLevel);
                    }

                    if (maxDefinitionLevel > 0)
                    {
                        pd.definitions = ReadLevels(reader, maxDefinitionLevel);
                    }

                    ReadColumn(dataTypeHandler, tse, reader, ph.Data_page_header.Encoding, maxValues,
                               out pd.values,
                               out pd.indexes);
                }
            }

            return(pd);
        }
예제 #2
0
        public ColumnarReader(Stream inputStream, Thrift.ColumnChunk thriftColumnChunk, ThriftFooter footer, ParquetOptions parquetOptions)
        {
            _inputStream       = inputStream ?? throw new ArgumentNullException(nameof(inputStream));
            _thriftColumnChunk = thriftColumnChunk ?? throw new ArgumentNullException(nameof(thriftColumnChunk));
            _footer            = footer ?? throw new ArgumentNullException(nameof(footer));
            _parquetOptions    = parquetOptions ?? throw new ArgumentNullException(nameof(parquetOptions));

            _thriftStream = new ThriftStream(inputStream);
            _footer.GetLevels(_thriftColumnChunk, out int mrl, out int mdl);
            _maxRepetitionLevel  = mrl;
            _maxDefinitionLevel  = mdl;
            _thriftSchemaElement = _footer.GetSchemaElement(_thriftColumnChunk);
            _dataTypeHandler     = DataTypeFactory.Match(_thriftSchemaElement, _parquetOptions);
        }
        private Thrift.ColumnChunk WriteColumnChunk(Thrift.SchemaElement tse, List <string> path, DataColumn column, IDataTypeHandler dataTypeHandler)
        {
            Thrift.ColumnChunk chunk = _footer.CreateColumnChunk(_compressionMethod, _stream, tse.Type, path, 0);
            Thrift.PageHeader  ph    = _footer.CreateDataPage(_rowCount);
            _footer.GetLevels(chunk, out int maxRepetitionLevel, out int maxDefinitionLevel);

            List <PageTag> pages = WriteColumn(column, tse, dataTypeHandler, maxRepetitionLevel, maxDefinitionLevel);

            chunk.Meta_data.Num_values = ph.Data_page_header.Num_values;

            //the following counters must include both data size and header size
            chunk.Meta_data.Total_compressed_size   = pages.Sum(p => p.HeaderMeta.Compressed_page_size + p.HeaderSize);
            chunk.Meta_data.Total_uncompressed_size = pages.Sum(p => p.HeaderMeta.Uncompressed_page_size + p.HeaderSize);

            return(chunk);
        }
예제 #4
0
        public Thrift.ColumnChunk Write(List <string> path, DataColumn column, IDataTypeHandler dataTypeHandler)
        {
            Thrift.ColumnChunk chunk = _footer.CreateColumnChunk(_compressionMethod, _stream, _schemaElement.Type, path, 0);
            Thrift.PageHeader  ph    = _footer.CreateDataPage(column.TotalCount);
            _footer.GetLevels(chunk, out int maxRepetitionLevel, out int maxDefinitionLevel);

            List <PageTag> pages = WriteColumn(column, _schemaElement, dataTypeHandler, maxRepetitionLevel, maxDefinitionLevel);

            //this count must be set to number of all values in the column, including nulls.
            //for hierarchy/repeated columns this is a count of flattened list, including nulls.
            chunk.Meta_data.Num_values = ph.Data_page_header.Num_values;

            //the following counters must include both data size and header size
            chunk.Meta_data.Total_compressed_size   = pages.Sum(p => p.HeaderMeta.Compressed_page_size + p.HeaderSize);
            chunk.Meta_data.Total_uncompressed_size = pages.Sum(p => p.HeaderMeta.Uncompressed_page_size + p.HeaderSize);

            return(chunk);
        }
        public ColumnarWriter(Stream output, ThriftStream thriftStream,
                              ThriftFooter footer,
                              Thrift.SchemaElement tse, List <string> path,
                              CompressionMethod compressionMethod,
                              ParquetOptions formatOptions,
                              WriterOptions writerOptions)
        {
            _output            = output;
            _thriftStream      = thriftStream;
            _footer            = footer;
            _tse               = tse;
            _compressionMethod = compressionMethod;
            _formatOptions     = formatOptions;
            _writerOptions     = writerOptions;
            _dataTypeHandler   = DataTypeFactory.Match(tse, _formatOptions);

            _chunk = _footer.CreateColumnChunk(_compressionMethod, _output, _tse.Type, path, 0);
            _ph    = _footer.CreateDataPage(0);
            _footer.GetLevels(_chunk, out int maxRepetitionLevel, out int maxDefinitionLevel);
            _maxRepetitionLevel = maxRepetitionLevel;
            _maxDefinitionLevel = maxDefinitionLevel;
        }