Пример #1
0
        /// <summary>
        /// Writes the dataset to the output stream, splitting it into row groups of at most
        /// <c>_writerOptions.RowGroupsSize</c> rows each.
        /// </summary>
        /// <param name="dataSet">Dataset to write</param>
        /// <param name="compression">Compression method</param>
        /// <param name="append">When true, appends to the file, otherwise creates a new file.</param>
        /// <exception cref="ArgumentNullException"><paramref name="dataSet"/> is null.</exception>
        public void Write(DataSet dataSet, CompressionMethod compression = CompressionMethod.Gzip, bool append = false)
        {
            // fail before PrepareFile gets a chance to touch the output
            if (dataSet == null)
            {
                throw new ArgumentNullException(nameof(dataSet));
            }

            PrepareFile(dataSet, append);

            var stats = new DataSetStats(dataSet);

            int offset = 0;

            // do/while: the body runs at least once, so an empty dataset still gets one row group
            do
            {
                int             count = Math.Min(_writerOptions.RowGroupsSize, dataSet.Count - offset);
                Thrift.RowGroup rg    = _meta.AddRowGroup();

                // each column is written to the output as a side effect of the projection;
                // ToList() forces that to happen here, in schema order
                rg.Columns = dataSet.Schema.Elements
                             .Select(c =>
                                     Write(c, dataSet.GetColumn(c.Name, offset, count), compression, stats.GetColumnStats(c)))
                             .ToList();

                //row group's size is taken as the sum of the sizes recorded in each column chunk's metadata
                //NOTE(review): the Parquet format describes total_byte_size as the _uncompressed_ size,
                //while the value summed here is Total_compressed_size - confirm which one is intended
                rg.Total_byte_size = rg.Columns.Sum(c => c.Meta_data.Total_compressed_size);
                rg.Num_rows        = count;

                // advance by the rows actually written: same result as stepping by RowGroupsSize,
                // but offset never goes past dataSet.Count and therefore cannot overflow
                offset += count;
            }while (offset < dataSet.Count);

            _dataWritten = true;
        }
Пример #2
0
        /// <summary>
        /// Writes the dataset to the output stream, splitting it into row groups of at most
        /// <c>_writerOptions.RowGroupsSize</c> rows each and writing every flattened schema
        /// element as a separate column chunk.
        /// </summary>
        /// <param name="dataSet">Dataset to write</param>
        /// <param name="compression">Compression method</param>
        /// <param name="append">When true, appends to the file, otherwise creates a new file.</param>
        /// <exception cref="ArgumentNullException"><paramref name="dataSet"/> is null.</exception>
        public void Write(DataSet dataSet, CompressionMethod compression = CompressionMethod.Gzip, bool append = false)
        {
            // fail before PrepareFile gets a chance to touch the output
            if (dataSet == null)
            {
                throw new ArgumentNullException(nameof(dataSet));
            }

            PrepareFile(dataSet, append);

            int offset = 0;

            // do/while: the body runs at least once, so an empty dataset still gets one row group
            do
            {
                int             count = Math.Min(_writerOptions.RowGroupsSize, dataSet.Count - offset);
                Thrift.RowGroup rg    = _meta.AddRowGroup();

                rg.Columns = new List <Thrift.ColumnChunk>();
                foreach (SchemaElement se in dataSet.Schema.Flatten())
                {
                    // a fresh writer per column; it receives the slice [offset, offset + count) of that column
                    var   cw     = new ColumnWriter(Stream, ThriftStream, _meta, se, compression, _formatOptions, _writerOptions);
                    IList values = dataSet.GetColumn(se, offset, count);
                    rg.Columns.Add(cw.Write(offset, count, values));
                }

                //row group's size is taken as the sum of the sizes recorded in each column chunk's metadata
                //NOTE(review): the Parquet format describes total_byte_size as the _uncompressed_ size,
                //while the value summed here is Total_compressed_size - confirm which one is intended
                rg.Total_byte_size = rg.Columns.Sum(c => c.Meta_data.Total_compressed_size);
                rg.Num_rows        = count;

                // advance by the rows actually written: same result as stepping by RowGroupsSize,
                // but offset never goes past dataSet.Count and therefore cannot overflow
                offset += count;
            }while (offset < dataSet.Count);

            _dataWritten = true;
        }
        /// <summary>
        /// Creates a reader for one row group and indexes the row group's column chunks by their path.
        /// </summary>
        /// <param name="rowGroup">Row group metadata; its column chunks are cached by path.</param>
        /// <param name="footer">File footer, kept for later use.</param>
        /// <param name="stream">Underlying stream, kept for later use.</param>
        /// <param name="thriftStream">Thrift stream, kept for later use.</param>
        /// <param name="parquetOptions">Reader options, kept for later use.</param>
        /// <exception cref="ArgumentNullException">Any of the arguments is null.</exception>
        internal ParquetRowGroupReader(
            Thrift.RowGroup rowGroup,
            ThriftFooter footer,
            Stream stream, ThriftStream thriftStream,
            ParquetOptions parquetOptions)
        {
            // validate in declaration order so the first missing argument is the one reported
            if (rowGroup is null)
            {
                throw new ArgumentNullException(nameof(rowGroup));
            }
            if (footer is null)
            {
                throw new ArgumentNullException(nameof(footer));
            }
            if (stream is null)
            {
                throw new ArgumentNullException(nameof(stream));
            }
            if (thriftStream is null)
            {
                throw new ArgumentNullException(nameof(thriftStream));
            }
            if (parquetOptions is null)
            {
                throw new ArgumentNullException(nameof(parquetOptions));
            }

            _rowGroup       = rowGroup;
            _footer         = footer;
            _stream         = stream;
            _thriftStream   = thriftStream;
            _parquetOptions = parquetOptions;

            // build the path -> chunk lookup; a later chunk with the same path replaces an earlier one
            foreach (Thrift.ColumnChunk chunk in rowGroup.Columns)
            {
                _pathToChunk[chunk.GetPath()] = chunk;
            }
        }
Пример #4
0
        /// <summary>
        /// Creates a writer for a new row group: stores the collaborators, registers a new row group
        /// in the footer, captures the current stream position and fetches the writeable schema.
        /// </summary>
        /// <param name="schema">Schema, kept for later use.</param>
        /// <param name="stream">Output stream; its current position is captured.</param>
        /// <param name="thriftStream">Thrift stream, kept for later use.</param>
        /// <param name="footer">Footer that the new row group is added to.</param>
        /// <param name="compressionMethod">Compression method, kept for later use.</param>
        /// <param name="formatOptions">Format options, kept for later use; not null-checked here.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="schema"/>, <paramref name="stream"/>, <paramref name="thriftStream"/>
        /// or <paramref name="footer"/> is null.
        /// </exception>
        internal ParquetRowGroupWriter(Schema schema,
                                       Stream stream,
                                       ThriftStream thriftStream,
                                       ThriftFooter footer,
                                       CompressionMethod compressionMethod,
                                       ParquetOptions formatOptions)
        {
            // validate in declaration order so the first missing argument is the one reported
            if (schema is null)
            {
                throw new ArgumentNullException(nameof(schema));
            }
            if (stream is null)
            {
                throw new ArgumentNullException(nameof(stream));
            }
            if (thriftStream is null)
            {
                throw new ArgumentNullException(nameof(thriftStream));
            }
            if (footer is null)
            {
                throw new ArgumentNullException(nameof(footer));
            }

            _schema            = schema;
            _stream            = stream;
            _thriftStream      = thriftStream;
            _footer            = footer;
            _compressionMethod = compressionMethod;
            _formatOptions     = formatOptions;

            // register the row group first, then capture the stream position
            Thrift.RowGroup added = footer.AddRowGroup();
            _thriftRowGroup = added;
            _rgStartPos     = stream.Position;
            added.Columns   = new List <Thrift.ColumnChunk>();

            _thschema = footer.GetWriteableSchema();
        }
Пример #5
0
        /// <summary>
        /// Writes the whole dataset to the output stream as a single row group.
        /// </summary>
        /// <param name="dataSet">Dataset to write</param>
        /// <param name="compression">Compression method</param>
        /// <exception cref="ArgumentNullException"><paramref name="dataSet"/> is null.</exception>
        public void Write(DataSet dataSet, CompressionMethod compression = CompressionMethod.Gzip)
        {
            // fail before the schema is registered in the file metadata
            if (dataSet == null)
            {
                throw new ArgumentNullException(nameof(dataSet));
            }

            _meta.AddSchema(dataSet);

            var stats = new DataSetStats(dataSet);

            Thrift.RowGroup rg = _meta.AddRowGroup();

            // each column is written to the output as a side effect of the projection;
            // ToList() forces that to happen here, in schema order
            rg.Columns = dataSet.Schema.Elements
                         .Select(c =>
                                 Write(c, dataSet.GetColumn(c.Name), compression, stats.GetColumnStats(c)))
                         .ToList();

            //row group's size is a sum of _uncompressed_ sizes of all columns in it
            rg.Total_byte_size = rg.Columns.Sum(c => c.Meta_data.Total_uncompressed_size);
            rg.Num_rows        = dataSet.Count;

            _dataWritten = true;
        }
Пример #6
0
        /// <summary>
        /// Writes the dataset to the output stream, splitting it into row groups of at most
        /// <c>_writerOptions.RowGroupsSize</c> rows each. The dataset's custom metadata is copied
        /// to the footer, and one column chunk is written per element of the footer's writeable schema.
        /// </summary>
        /// <param name="dataSet">Dataset to write</param>
        /// <param name="compression">Compression method</param>
        /// <param name="append">When true, appends to the file, otherwise creates a new file.</param>
        /// <exception cref="ArgumentNullException"><paramref name="dataSet"/> is null.</exception>
        public void Write(DataSet dataSet, CompressionMethod compression = CompressionMethod.Gzip, bool append = false)
        {
            // fail before PrepareFile gets a chance to touch the output
            if (dataSet == null)
            {
                throw new ArgumentNullException(nameof(dataSet));
            }

            PrepareFile(dataSet, append);
            _footer.CustomMetadata = dataSet.Metadata.Custom;

            int offset = 0;

            // resolved once: the same schema elements are used for every row group
            List <Thrift.SchemaElement> writeableSchema = _footer.GetWriteableSchema().ToList();

            // do/while: the body runs at least once, so an empty dataset still gets one row group
            do
            {
                int             count = Math.Min(_writerOptions.RowGroupsSize, dataSet.Count - offset);
                Thrift.RowGroup rg    = _footer.AddRowGroup();

                rg.Columns = new List <Thrift.ColumnChunk>();

                foreach (Thrift.SchemaElement tse in writeableSchema)
                {
                    // the column is looked up in the dataset by its separator-joined path
                    List <string> path     = _footer.GetPath(tse);
                    string        flatPath = string.Join(Schema.PathSeparator, path);
                    var           cw       = new ColumnarWriter(Stream, ThriftStream, _footer, tse, path, compression, _formatOptions, _writerOptions);

                    IList values = dataSet.GetColumn(flatPath, offset, count);
                    rg.Columns.Add(cw.Write(offset, count, values));
                }

                //row group's size is taken as the sum of the sizes recorded in each column chunk's metadata
                //NOTE(review): the Parquet format describes total_byte_size as the _uncompressed_ size,
                //while the value summed here is Total_compressed_size - confirm which one is intended
                rg.Total_byte_size = rg.Columns.Sum(c => c.Meta_data.Total_compressed_size);
                rg.Num_rows        = count;

                // advance by the rows actually written: same result as stepping by RowGroupsSize,
                // but offset never goes past dataSet.Count and therefore cannot overflow
                offset += count;
            }while (offset < dataSet.Count);

            _dataWritten = true;
        }
Пример #7
0
        /// <summary>
        /// Populates this struct from <paramref name="iprot"/>, consuming fields until a stop field is read.
        /// Fields with an unknown id or an unexpected wire type are skipped.
        /// </summary>
        /// <param name="iprot">Protocol to read the struct from.</param>
        /// <exception cref="TProtocolException">
        /// Thrown with <c>INVALID_DATA</c> when any of fields 1-4 (version, schema, num_rows, row_groups)
        /// was not read.
        /// </exception>
        public void Read(TProtocol iprot)
        {
            iprot.IncrementRecursionDepth();
            try
            {
                // fields 1-4 are verified after the struct has been consumed; 5 and 6 are not
                bool sawVersion   = false;
                bool sawSchema    = false;
                bool sawNumRows   = false;
                bool sawRowGroups = false;

                iprot.ReadStructBegin();
                for (TField current = iprot.ReadFieldBegin(); current.Type != TType.Stop; current = iprot.ReadFieldBegin())
                {
                    switch (current.ID)
                    {
                        case 1: // version
                            if (current.Type != TType.I32)
                            {
                                TProtocolUtil.Skip(iprot, current.Type);
                            }
                            else
                            {
                                Version    = iprot.ReadI32();
                                sawVersion = true;
                            }
                            break;

                        case 2: // schema
                            if (current.Type != TType.List)
                            {
                                TProtocolUtil.Skip(iprot, current.Type);
                            }
                            else
                            {
                                Schema = new List <SchemaElement>();
                                TList header = iprot.ReadListBegin();
                                for (int i = 0; i < header.Count; i++)
                                {
                                    var item = new SchemaElement();
                                    item.Read(iprot);
                                    Schema.Add(item);
                                }
                                iprot.ReadListEnd();
                                sawSchema = true;
                            }
                            break;

                        case 3: // num_rows
                            if (current.Type != TType.I64)
                            {
                                TProtocolUtil.Skip(iprot, current.Type);
                            }
                            else
                            {
                                Num_rows   = iprot.ReadI64();
                                sawNumRows = true;
                            }
                            break;

                        case 4: // row_groups
                            if (current.Type != TType.List)
                            {
                                TProtocolUtil.Skip(iprot, current.Type);
                            }
                            else
                            {
                                Row_groups = new List <RowGroup>();
                                TList header = iprot.ReadListBegin();
                                for (int i = 0; i < header.Count; i++)
                                {
                                    var item = new RowGroup();
                                    item.Read(iprot);
                                    Row_groups.Add(item);
                                }
                                iprot.ReadListEnd();
                                sawRowGroups = true;
                            }
                            break;

                        case 5: // key_value_metadata
                            if (current.Type != TType.List)
                            {
                                TProtocolUtil.Skip(iprot, current.Type);
                            }
                            else
                            {
                                Key_value_metadata = new List <KeyValue>();
                                TList header = iprot.ReadListBegin();
                                for (int i = 0; i < header.Count; i++)
                                {
                                    var item = new KeyValue();
                                    item.Read(iprot);
                                    Key_value_metadata.Add(item);
                                }
                                iprot.ReadListEnd();
                            }
                            break;

                        case 6: // created_by
                            if (current.Type != TType.String)
                            {
                                TProtocolUtil.Skip(iprot, current.Type);
                            }
                            else
                            {
                                Created_by = iprot.ReadString();
                            }
                            break;

                        default:
                            TProtocolUtil.Skip(iprot, current.Type);
                            break;
                    }
                    iprot.ReadFieldEnd();
                }
                iprot.ReadStructEnd();

                // every one of the four tracked fields must have been read
                if (!(sawVersion && sawSchema && sawNumRows && sawRowGroups))
                {
                    throw new TProtocolException(TProtocolException.INVALID_DATA);
                }
            }
            finally
            {
                iprot.DecrementRecursionDepth();
            }
        }
Пример #8
0
        /// <summary>
        /// Populates this struct from <paramref name="iprot"/>, consuming fields until a stop field is read.
        /// Fields with an unknown id or an unexpected wire type are skipped.
        /// </summary>
        /// <param name="iprot">Protocol to read the struct from.</param>
        /// <exception cref="TProtocolException">
        /// Thrown with <c>INVALID_DATA</c> when any of fields 1-4 (version, schema, num_rows, row_groups)
        /// was not read.
        /// </exception>
        public void Read(TProtocol iprot)
        {
            iprot.IncrementRecursionDepth();
            try
            {
                // fields 1-4 are verified after the struct has been consumed; 5-9 are not
                bool sawVersion   = false;
                bool sawSchema    = false;
                bool sawNumRows   = false;
                bool sawRowGroups = false;

                iprot.ReadStructBegin();
                for (TField current = iprot.ReadFieldBegin(); current.Type != TType.Stop; current = iprot.ReadFieldBegin())
                {
                    switch (current.ID)
                    {
                        case 1: // version
                            if (current.Type != TType.I32)
                            {
                                TProtocolUtil.Skip(iprot, current.Type);
                            }
                            else
                            {
                                Version    = iprot.ReadI32();
                                sawVersion = true;
                            }
                            break;

                        case 2: // schema
                            if (current.Type != TType.List)
                            {
                                TProtocolUtil.Skip(iprot, current.Type);
                            }
                            else
                            {
                                Schema = new List <SchemaElement>();
                                TList header = iprot.ReadListBegin();
                                for (int i = 0; i < header.Count; i++)
                                {
                                    var item = new SchemaElement();
                                    item.Read(iprot);
                                    Schema.Add(item);
                                }
                                iprot.ReadListEnd();
                                sawSchema = true;
                            }
                            break;

                        case 3: // num_rows
                            if (current.Type != TType.I64)
                            {
                                TProtocolUtil.Skip(iprot, current.Type);
                            }
                            else
                            {
                                Num_rows   = iprot.ReadI64();
                                sawNumRows = true;
                            }
                            break;

                        case 4: // row_groups
                            if (current.Type != TType.List)
                            {
                                TProtocolUtil.Skip(iprot, current.Type);
                            }
                            else
                            {
                                Row_groups = new List <RowGroup>();
                                TList header = iprot.ReadListBegin();
                                for (int i = 0; i < header.Count; i++)
                                {
                                    var item = new RowGroup();
                                    item.Read(iprot);
                                    Row_groups.Add(item);
                                }
                                iprot.ReadListEnd();
                                sawRowGroups = true;
                            }
                            break;

                        case 5: // key_value_metadata
                            if (current.Type != TType.List)
                            {
                                TProtocolUtil.Skip(iprot, current.Type);
                            }
                            else
                            {
                                Key_value_metadata = new List <KeyValue>();
                                TList header = iprot.ReadListBegin();
                                for (int i = 0; i < header.Count; i++)
                                {
                                    var item = new KeyValue();
                                    item.Read(iprot);
                                    Key_value_metadata.Add(item);
                                }
                                iprot.ReadListEnd();
                            }
                            break;

                        case 6: // created_by
                            if (current.Type != TType.String)
                            {
                                TProtocolUtil.Skip(iprot, current.Type);
                            }
                            else
                            {
                                Created_by = iprot.ReadString();
                            }
                            break;

                        case 7: // column_orders
                            if (current.Type != TType.List)
                            {
                                TProtocolUtil.Skip(iprot, current.Type);
                            }
                            else
                            {
                                Column_orders = new List <ColumnOrder>();
                                TList header = iprot.ReadListBegin();
                                for (int i = 0; i < header.Count; i++)
                                {
                                    var item = new ColumnOrder();
                                    item.Read(iprot);
                                    Column_orders.Add(item);
                                }
                                iprot.ReadListEnd();
                            }
                            break;

                        case 8: // encryption_algorithm
                            if (current.Type != TType.Struct)
                            {
                                TProtocolUtil.Skip(iprot, current.Type);
                            }
                            else
                            {
                                Encryption_algorithm = new EncryptionAlgorithm();
                                Encryption_algorithm.Read(iprot);
                            }
                            break;

                        case 9: // footer_signing_key_metadata; read as binary when the wire type is String
                            if (current.Type != TType.String)
                            {
                                TProtocolUtil.Skip(iprot, current.Type);
                            }
                            else
                            {
                                Footer_signing_key_metadata = iprot.ReadBinary();
                            }
                            break;

                        default:
                            TProtocolUtil.Skip(iprot, current.Type);
                            break;
                    }
                    iprot.ReadFieldEnd();
                }
                iprot.ReadStructEnd();

                // every one of the four tracked fields must have been read
                if (!(sawVersion && sawSchema && sawNumRows && sawRowGroups))
                {
                    throw new TProtocolException(TProtocolException.INVALID_DATA);
                }
            }
            finally
            {
                iprot.DecrementRecursionDepth();
            }
        }
Пример #9
0
        /// <summary>
        /// Asynchronously populates this struct from <paramref name="iprot"/>, consuming fields until
        /// a stop field is read. Fields with an unknown id or an unexpected wire type are skipped.
        /// </summary>
        /// <param name="iprot">Protocol to read the struct from.</param>
        /// <param name="cancellationToken">Token passed to every asynchronous protocol call.</param>
        /// <returns>A task that completes once the struct has been read.</returns>
        /// <exception cref="TProtocolException">
        /// Thrown with <c>INVALID_DATA</c> when any of fields 1-4 (version, schema, num_rows, row_groups)
        /// was not read.
        /// </exception>
        public async Task ReadAsync(TProtocol iprot, CancellationToken cancellationToken)
        {
            iprot.IncrementRecursionDepth();
            try
            {
                // fields 1-4 are verified after the struct has been consumed; 5-7 are not
                bool sawVersion   = false;
                bool sawSchema    = false;
                bool sawNumRows   = false;
                bool sawRowGroups = false;

                await iprot.ReadStructBeginAsync(cancellationToken);

                for (TField current = await iprot.ReadFieldBeginAsync(cancellationToken);
                     current.Type != TType.Stop;
                     current = await iprot.ReadFieldBeginAsync(cancellationToken))
                {
                    switch (current.ID)
                    {
                        case 1: // version
                            if (current.Type != TType.I32)
                            {
                                await TProtocolUtil.SkipAsync(iprot, current.Type, cancellationToken);
                            }
                            else
                            {
                                Version    = await iprot.ReadI32Async(cancellationToken);
                                sawVersion = true;
                            }
                            break;

                        case 2: // schema
                            if (current.Type != TType.List)
                            {
                                await TProtocolUtil.SkipAsync(iprot, current.Type, cancellationToken);
                            }
                            else
                            {
                                Schema = new List <SchemaElement>();
                                TList header = await iprot.ReadListBeginAsync(cancellationToken);
                                for (int i = 0; i < header.Count; i++)
                                {
                                    var item = new SchemaElement();
                                    await item.ReadAsync(iprot, cancellationToken);
                                    Schema.Add(item);
                                }
                                await iprot.ReadListEndAsync(cancellationToken);
                                sawSchema = true;
                            }
                            break;

                        case 3: // num_rows
                            if (current.Type != TType.I64)
                            {
                                await TProtocolUtil.SkipAsync(iprot, current.Type, cancellationToken);
                            }
                            else
                            {
                                Num_rows   = await iprot.ReadI64Async(cancellationToken);
                                sawNumRows = true;
                            }
                            break;

                        case 4: // row_groups
                            if (current.Type != TType.List)
                            {
                                await TProtocolUtil.SkipAsync(iprot, current.Type, cancellationToken);
                            }
                            else
                            {
                                Row_groups = new List <RowGroup>();
                                TList header = await iprot.ReadListBeginAsync(cancellationToken);
                                for (int i = 0; i < header.Count; i++)
                                {
                                    var item = new RowGroup();
                                    await item.ReadAsync(iprot, cancellationToken);
                                    Row_groups.Add(item);
                                }
                                await iprot.ReadListEndAsync(cancellationToken);
                                sawRowGroups = true;
                            }
                            break;

                        case 5: // key_value_metadata
                            if (current.Type != TType.List)
                            {
                                await TProtocolUtil.SkipAsync(iprot, current.Type, cancellationToken);
                            }
                            else
                            {
                                Key_value_metadata = new List <KeyValue>();
                                TList header = await iprot.ReadListBeginAsync(cancellationToken);
                                for (int i = 0; i < header.Count; i++)
                                {
                                    var item = new KeyValue();
                                    await item.ReadAsync(iprot, cancellationToken);
                                    Key_value_metadata.Add(item);
                                }
                                await iprot.ReadListEndAsync(cancellationToken);
                            }
                            break;

                        case 6: // created_by
                            if (current.Type != TType.String)
                            {
                                await TProtocolUtil.SkipAsync(iprot, current.Type, cancellationToken);
                            }
                            else
                            {
                                Created_by = await iprot.ReadStringAsync(cancellationToken);
                            }
                            break;

                        case 7: // column_orders
                            if (current.Type != TType.List)
                            {
                                await TProtocolUtil.SkipAsync(iprot, current.Type, cancellationToken);
                            }
                            else
                            {
                                Column_orders = new List <ColumnOrder>();
                                TList header = await iprot.ReadListBeginAsync(cancellationToken);
                                for (int i = 0; i < header.Count; i++)
                                {
                                    var item = new ColumnOrder();
                                    await item.ReadAsync(iprot, cancellationToken);
                                    Column_orders.Add(item);
                                }
                                await iprot.ReadListEndAsync(cancellationToken);
                            }
                            break;

                        default:
                            await TProtocolUtil.SkipAsync(iprot, current.Type, cancellationToken);
                            break;
                    }

                    await iprot.ReadFieldEndAsync(cancellationToken);
                }

                await iprot.ReadStructEndAsync(cancellationToken);

                // every one of the four tracked fields must have been read
                if (!(sawVersion && sawSchema && sawNumRows && sawRowGroups))
                {
                    throw new TProtocolException(TProtocolException.INVALID_DATA);
                }
            }
            finally
            {
                iprot.DecrementRecursionDepth();
            }
        }