예제 #1
0
        /// <summary>
        /// Writes the next data column to the parquet stream. Columns have to be passed in the same order
        /// in which they are declared in the file schema.
        /// </summary>
        /// <param name="column">Column to write; its field must equal the schema element expected at the current position.</param>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="column"/> is null.</exception>
        /// <exception cref="ArgumentException">Thrown when the column's field does not equal the next expected schema element.</exception>
        public void WriteColumn(DataColumn column)
        {
            if (column == null)
            {
                throw new ArgumentNullException(nameof(column));
            }

            // The row count is only computed while it is still unset, and only from a column that
            // either carries data or has a field with no repetition level.
            if (RowCount == null && (column.Data.Length > 0 || column.Field.MaxRepetitionLevel == 0))
            {
                RowCount = column.CalculateRowCount();
            }

            Thrift.SchemaElement expected = _thschema[_colIdx];
            bool isExpectedColumn = column.Field.Equals(expected);
            if (!isExpectedColumn)
            {
                throw new ArgumentException($"cannot write this column, expected '{expected.Name}', passed: '{column.Field.Name}'", nameof(column));
            }

            // Resolve the type handler before moving on to the next schema position.
            IDataTypeHandler typeHandler = DataTypeFactory.Match(expected, _formatOptions);
            _colIdx++;

            List<string> columnPath = _footer.GetPath(expected);
            int rowCount = (int)(RowCount ?? 0);

            var columnWriter = new DataColumnWriter(
                _stream, _thriftStream, _footer, expected,
                _compressionMethod, _compressionLevel,
                rowCount);

            _thriftRowGroup.Columns.Add(columnWriter.Write(columnPath, column, typeHandler));
        }
예제 #2
0
        /// <summary>
        /// Write out dataset to the output stream, splitting it into row groups of at most
        /// <c>_writerOptions.RowGroupsSize</c> rows each.
        /// </summary>
        /// <param name="dataSet">Dataset to write</param>
        /// <param name="compression">Compression method</param>
        /// <param name="append">When true, appends to the file, otherwise creates a new file.</param>
        public void Write(DataSet dataSet, CompressionMethod compression = CompressionMethod.Gzip, bool append = false)
        {
            PrepareFile(dataSet, append);

            int offset = 0;
            int count;

            // do/while: the body runs once even when the dataset has no rows, so a row group is always added
            do
            {
                count = Math.Min(_writerOptions.RowGroupsSize, dataSet.Count - offset);
                Thrift.RowGroup rg = _meta.AddRowGroup();

                rg.Columns = new List<Thrift.ColumnChunk>();
                foreach (SchemaElement se in dataSet.Schema.Flatten())
                {
                    var cw = new ColumnWriter(Stream, ThriftStream, _meta, se, compression, _formatOptions, _writerOptions);
                    IList values = dataSet.GetColumn(se, offset, count);
                    Thrift.ColumnChunk chunk = cw.Write(offset, count, values);
                    rg.Columns.Add(chunk);
                }

                //row group's size is a sum of _uncompressed_ sizes of all columns in it, including the headers,
                //so the uncompressed counter of each column chunk is summed (not the compressed one)
                rg.Total_byte_size = rg.Columns.Sum(c => c.Meta_data.Total_uncompressed_size);
                rg.Num_rows        = count;

                offset += _writerOptions.RowGroupsSize;
            }
            while (offset < dataSet.Count);

            _dataWritten = true;
        }
예제 #3
0
        /// <summary>
        /// Registers a column chunk in the file metadata, writes the values as data pages and records
        /// the resulting compressed and uncompressed sizes on the chunk.
        /// </summary>
        /// <param name="schema">Schema element of the column being written.</param>
        /// <param name="values">Values to write.</param>
        /// <param name="compression">Compression method passed to the chunk metadata and the page writer.</param>
        /// <param name="stats">Column statistics handed to the page writer.</param>
        /// <returns>The column chunk with its size counters filled in.</returns>
        private Thrift.ColumnChunk Write(SchemaElement schema, IList values,
                                         CompressionMethod compression,
                                         ColumnStats stats)
        {
            Thrift.ColumnChunk columnChunk = _meta.AddColumnChunk(compression, _output, schema, values.Count);
            Thrift.PageHeader pageHeader = _meta.CreateDataPage(values.Count);

            List<PageTag> writtenPages = WriteValues(schema, values, pageHeader, compression, stats);

            // Both counters must include the data size and the header size of every page.
            var compressedTotal = writtenPages.Sum(page => page.HeaderMeta.Compressed_page_size + page.HeaderSize);
            var uncompressedTotal = writtenPages.Sum(page => page.HeaderMeta.Uncompressed_page_size + page.HeaderSize);

            columnChunk.Meta_data.Total_compressed_size = compressedTotal;
            columnChunk.Meta_data.Total_uncompressed_size = uncompressedTotal;

            return columnChunk;
        }
예제 #4
0
        /// <summary>
        /// Registers a column chunk in the file metadata and writes the values using a data page header
        /// declared as PLAIN encoded, with RLE definition levels and BIT_PACKED repetition levels.
        /// </summary>
        /// <param name="schema">Schema element of the column being written.</param>
        /// <param name="values">Values to write; their count is stored in the data page header.</param>
        /// <param name="compression">Compression method passed to the chunk metadata and the page writer.</param>
        /// <param name="stats">Column statistics handed to the page writer.</param>
        /// <returns>The column chunk created for this column.</returns>
        private Thrift.ColumnChunk Write(SchemaElement schema, IList values,
                                         CompressionMethod compression,
                                         ColumnStats stats)
        {
            Thrift.ColumnChunk columnChunk = _meta.AddColumnChunk(compression, _output, schema, values.Count);

            // The header is created with both size arguments set to 0.
            var pageHeader = new Thrift.PageHeader(Thrift.PageType.DATA_PAGE, 0, 0)
            {
                Data_page_header = new Thrift.DataPageHeader
                {
                    Encoding = Thrift.Encoding.PLAIN,
                    Definition_level_encoding = Thrift.Encoding.RLE,
                    Repetition_level_encoding = Thrift.Encoding.BIT_PACKED,
                    Num_values = values.Count
                }
            };

            WriteValues(schema, values, pageHeader, compression, stats);

            return columnChunk;
        }
예제 #5
0
        /// <summary>
        /// Write out dataset to the output stream, splitting it into row groups of at most
        /// <c>_writerOptions.RowGroupsSize</c> rows each. The dataset's custom metadata is copied
        /// into the file footer.
        /// </summary>
        /// <param name="dataSet">Dataset to write</param>
        /// <param name="compression">Compression method</param>
        /// <param name="append">When true, appends to the file, otherwise creates a new file.</param>
        public void Write(DataSet dataSet, CompressionMethod compression = CompressionMethod.Gzip, bool append = false)
        {
            PrepareFile(dataSet, append);
            _footer.CustomMetadata = dataSet.Metadata.Custom;

            int offset = 0;
            int count;

            // materialised once so the writeable schema is not re-enumerated for every row group
            List<Thrift.SchemaElement> writeableSchema = _footer.GetWriteableSchema().ToList();

            // do/while: the body runs once even when the dataset has no rows, so a row group is always added
            do
            {
                count = Math.Min(_writerOptions.RowGroupsSize, dataSet.Count - offset);
                Thrift.RowGroup rg = _footer.AddRowGroup();

                rg.Columns = new List<Thrift.ColumnChunk>();

                foreach (Thrift.SchemaElement tse in writeableSchema)
                {
                    List<string> path     = _footer.GetPath(tse);
                    string       flatPath = string.Join(Schema.PathSeparator, path);
                    var          cw       = new ColumnarWriter(Stream, ThriftStream, _footer, tse, path, compression, _formatOptions, _writerOptions);

                    IList values             = dataSet.GetColumn(flatPath, offset, count);
                    Thrift.ColumnChunk chunk = cw.Write(offset, count, values);
                    rg.Columns.Add(chunk);
                }

                //row group's size is a sum of _uncompressed_ sizes of all columns in it, including the headers,
                //so the uncompressed counter of each column chunk is summed (not the compressed one)
                rg.Total_byte_size = rg.Columns.Sum(c => c.Meta_data.Total_uncompressed_size);
                rg.Num_rows        = count;

                offset += _writerOptions.RowGroupsSize;
            }
            while (offset < dataSet.Count);

            _dataWritten = true;
        }
예제 #6
0
        /// <summary>
        /// Reads the file metadata and then the column chunks of every row group that falls inside the
        /// window described by the reader options (<c>Offset</c> and <c>Count</c>, where a count of -1
        /// means no limit), collecting the values per column path into a <see cref="DataSet"/>.
        /// </summary>
        /// <returns>Data set built from the parsed schema and the values that were read.</returns>
        /// <exception cref="ParquetException">Wraps any exception raised while reading a column.</exception>
        public DataSet Read()
        {
            _readerOptions.Validate();

            _meta = ReadMetadata();

            Schema schema = new FileMetadataParser(_meta).ParseSchema(_formatOptions);

            var valuesByPath = new Dictionary<string, IList>();
            long groupStart = 0;   // index of the first row of the current row group
            long rowsRead = 0;

            foreach (Thrift.RowGroup rowGroup in _meta.Row_groups)
            {
                // A row group is skipped completely when the requested count has already been read
                // or when the requested offset lies beyond the last row of the group.
                bool limitReached = _readerOptions.Count != -1 && rowsRead >= _readerOptions.Count;
                if (limitReached || _readerOptions.Offset > groupStart + rowGroup.Num_rows - 1)
                {
                    groupStart += rowGroup.Num_rows;
                    continue;
                }

                long offset = Math.Max(0, _readerOptions.Offset - groupStart);
                long count;
                if (_readerOptions.Count == -1)
                {
                    count = rowGroup.Num_rows;
                }
                else
                {
                    count = Math.Min(_readerOptions.Count - rowsRead, rowGroup.Num_rows);
                }

                for (int columnIndex = 0; columnIndex < rowGroup.Columns.Count; columnIndex++)
                {
                    Thrift.ColumnChunk columnChunk = rowGroup.Columns[columnIndex];
                    SchemaElement element = schema[columnChunk];

                    var columnReader = new ColumnReader(columnChunk, element, _input, ThriftStream, _formatOptions);

                    try
                    {
                        IList chunkValues = columnReader.Read(offset, count);

                        if (valuesByPath.TryGetValue(element.Path, out IList accumulated))
                        {
                            accumulated.AddRange(chunkValues);
                        }
                        else
                        {
                            valuesByPath[element.Path] = chunkValues;
                        }

                        // Only the first column of the row group contributes to the row counter.
                        //todo: this may not work
                        if (columnIndex == 0)
                        {
                            rowsRead += chunkValues.Count;
                        }
                    }
                    catch (Exception ex)
                    {
                        throw new ParquetException($"fatal error reading column '{element}'", ex);
                    }
                }

                groupStart += rowGroup.Num_rows;
            }

            return new DataSet(schema, valuesByPath, _meta.Num_rows, _meta.Created_by);
        }
예제 #7
0
        /// <summary>
        /// Reads the file metadata and then the column chunks of every row group that falls inside the
        /// window described by the reader options (<c>Offset</c> and <c>Count</c>, where a count of -1
        /// means no limit). Columns that match none of the field predicates are skipped when predicates
        /// are set. The values are collected per column path into a <see cref="DataSet"/>, which also
        /// receives the footer's custom metadata and the raw Thrift metadata.
        /// </summary>
        /// <returns>Data set built from the (filtered) schema and the values that were read.</returns>
        /// <exception cref="ParquetException">Wraps any exception raised while reading a column.</exception>
        public DataSet Read()
        {
            _readerOptions.Validate();

            _meta = ReadMetadata();

            var footer = new ThriftFooter(_meta);

            var  pathToValues = new Dictionary<string, IList>();
            long pos          = 0;
            long rowsRead     = 0;

            foreach (Thrift.RowGroup rg in _meta.Row_groups)
            {
                //check whether to skip RG completely
                if ((_readerOptions.Count != -1 && rowsRead >= _readerOptions.Count) ||
                    (_readerOptions.Offset > pos + rg.Num_rows - 1))
                {
                    pos += rg.Num_rows;
                    continue;
                }

                long offset = Math.Max(0, _readerOptions.Offset - pos);
                long count  = _readerOptions.Count == -1 ? rg.Num_rows : Math.Min(_readerOptions.Count - rowsRead, rg.Num_rows);

                // Rows are counted from the first column that is actually read in this row group.
                // Counting only column index 0 would leave rowsRead unchanged whenever that column is
                // excluded by the field predicates, so the Count limit would never be reached.
                bool rowsCounted = false;

                for (int icol = 0; icol < rg.Columns.Count; icol++)
                {
                    Thrift.ColumnChunk cc   = rg.Columns[icol];
                    string             path = cc.GetPath();
                    if (_fieldPredicates != null && !_fieldPredicates.Any(p => p.IsMatch(cc, path)))
                    {
                        continue;
                    }

                    var columnarReader = new ColumnarReader(_input, cc, footer, _formatOptions);

                    try
                    {
                        IList chunkValues = columnarReader.Read(offset, count);

                        if (!pathToValues.TryGetValue(path, out IList allValues))
                        {
                            pathToValues[path] = chunkValues;
                        }
                        else
                        {
                            foreach (object v in chunkValues)
                            {
                                allValues.Add(v);
                            }
                        }

                        if (!rowsCounted)
                        {
                            //todo: this may not work
                            rowsRead += chunkValues.Count;
                            rowsCounted = true;
                        }
                    }
                    catch (Exception ex)
                    {
                        throw new ParquetException($"fatal error reading column '{path}'", ex);
                    }
                }

                pos += rg.Num_rows;
            }

            Schema schema = footer.CreateModelSchema(_formatOptions);

            schema = schema.Filter(_fieldPredicates);
            var ds = new DataSet(schema, pathToValues, _meta.Num_rows, _meta.Created_by);
            Dictionary<string, string> customMetadata = footer.CustomMetadata;

            if (customMetadata != null)
            {
                ds.Metadata.Custom.AddRange(customMetadata);
            }
            ds.Thrift = _meta;
            return ds;
        }
예제 #8
0
        /// <summary>
        /// Asynchronously reads this struct from the given protocol. Field 1 (list) populates
        /// <c>Columns</c>, field 2 (i64) <c>Total_byte_size</c>, field 3 (i64) <c>Num_rows</c> and
        /// field 4 (list) <c>Sorting_columns</c>; any other field id, or a known id with an unexpected
        /// wire type, is skipped.
        /// </summary>
        /// <param name="iprot">Protocol to read from.</param>
        /// <param name="cancellationToken">Token passed to every protocol call.</param>
        /// <exception cref="TProtocolException">
        /// Thrown with <c>INVALID_DATA</c> when fields 1, 2 or 3 were not read.
        /// </exception>
        public async Task ReadAsync(TProtocol iprot, CancellationToken cancellationToken)
        {
            iprot.IncrementRecursionDepth();
            try
            {
                bool hasColumns = false;
                bool hasTotalByteSize = false;
                bool hasNumRows = false;

                await iprot.ReadStructBeginAsync(cancellationToken);

                for (;;)
                {
                    TField field = await iprot.ReadFieldBeginAsync(cancellationToken);
                    if (field.Type == TType.Stop)
                    {
                        break;
                    }

                    if (field.ID == 1 && field.Type == TType.List)
                    {
                        Columns = new List<ColumnChunk>();
                        TList listHeader = await iprot.ReadListBeginAsync(cancellationToken);

                        for (int i = 0; i < listHeader.Count; i++)
                        {
                            var columnChunk = new ColumnChunk();
                            await columnChunk.ReadAsync(iprot, cancellationToken);
                            Columns.Add(columnChunk);
                        }

                        await iprot.ReadListEndAsync(cancellationToken);
                        hasColumns = true;
                    }
                    else if (field.ID == 2 && field.Type == TType.I64)
                    {
                        Total_byte_size = await iprot.ReadI64Async(cancellationToken);
                        hasTotalByteSize = true;
                    }
                    else if (field.ID == 3 && field.Type == TType.I64)
                    {
                        Num_rows = await iprot.ReadI64Async(cancellationToken);
                        hasNumRows = true;
                    }
                    else if (field.ID == 4 && field.Type == TType.List)
                    {
                        Sorting_columns = new List<SortingColumn>();
                        TList listHeader = await iprot.ReadListBeginAsync(cancellationToken);

                        for (int i = 0; i < listHeader.Count; i++)
                        {
                            var sortingColumn = new SortingColumn();
                            await sortingColumn.ReadAsync(iprot, cancellationToken);
                            Sorting_columns.Add(sortingColumn);
                        }

                        await iprot.ReadListEndAsync(cancellationToken);
                    }
                    else
                    {
                        // Unknown field id, or a known id carrying an unexpected wire type.
                        await TProtocolUtil.SkipAsync(iprot, field.Type, cancellationToken);
                    }

                    await iprot.ReadFieldEndAsync(cancellationToken);
                }

                await iprot.ReadStructEndAsync(cancellationToken);

                // Fields 1-3 must all have been read.
                if (!(hasColumns && hasTotalByteSize && hasNumRows))
                {
                    throw new TProtocolException(TProtocolException.INVALID_DATA);
                }
            }
            finally
            {
                iprot.DecrementRecursionDepth();
            }
        }
        /// <summary>
        /// Decodes raw bytes from <see cref="Thrift.Statistics"/> into a CLR value
        /// </summary>
        /// <param name="fileMeta">File metadata used to look up the schema element of the column chunk.</param>
        /// <param name="columnChunk">Column chunk the raw value belongs to.</param>
        /// <param name="rawBytes">Encoded value; may be null or empty.</param>
        /// <returns>The decoded value, or null when <paramref name="rawBytes"/> is null or empty.</returns>
        public static object DecodeSingleStatsValue(this Thrift.FileMetaData fileMeta, Thrift.ColumnChunk columnChunk, byte[] rawBytes)
        {
            bool hasBytes = rawBytes != null && rawBytes.Length > 0;
            if (!hasBytes)
            {
                return null;
            }

            Thrift.SchemaElement element = new ThriftFooter(fileMeta).GetSchemaElement(columnChunk);

            // The handler is matched with TreatByteArrayAsString switched on.
            var options = new ParquetOptions { TreatByteArrayAsString = true };
            IDataTypeHandler typeHandler = DataTypeFactory.Match(element, options);

            using (var buffer = new MemoryStream(rawBytes))
            using (var binary = new BinaryReader(buffer))
            {
                return typeHandler.Read(binary, element, rawBytes.Length);
            }
        }
예제 #10
0
 /// <summary>
 /// Builds the flat path of a column chunk by joining the entries of its metadata's
 /// <c>Path_in_schema</c> with <c>Schema.PathSeparator</c>.
 /// </summary>
 /// <param name="columnChunk">Column chunk whose path is requested.</param>
 /// <returns>The joined path string.</returns>
 public static string GetPath(this Thrift.ColumnChunk columnChunk)
 {
     var pathSegments = columnChunk.Meta_data.Path_in_schema;
     string flatPath = string.Join(Schema.PathSeparator, pathSegments);
     return flatPath;
 }
예제 #11
0
        /// <summary>
        /// Reads this struct from the given protocol. Field 1 (list) populates <c>Columns</c>,
        /// field 2 (i64) <c>Total_byte_size</c>, field 3 (i64) <c>Num_rows</c>, field 4 (list)
        /// <c>Sorting_columns</c>, field 5 (i64) <c>File_offset</c>, field 6 (i64)
        /// <c>Total_compressed_size</c> and field 7 (i16) <c>Ordinal</c>; any other field id, or a
        /// known id with an unexpected wire type, is skipped.
        /// </summary>
        /// <param name="iprot">Protocol to read from.</param>
        /// <exception cref="TProtocolException">
        /// Thrown with <c>INVALID_DATA</c> when fields 1, 2 or 3 were not read.
        /// </exception>
        public void Read(TProtocol iprot)
        {
            iprot.IncrementRecursionDepth();
            try
            {
                bool hasColumns = false;
                bool hasTotalByteSize = false;
                bool hasNumRows = false;

                iprot.ReadStructBegin();

                for (;;)
                {
                    TField field = iprot.ReadFieldBegin();
                    if (field.Type == TType.Stop)
                    {
                        break;
                    }

                    if (field.ID == 1 && field.Type == TType.List)
                    {
                        Columns = new List<ColumnChunk>();
                        TList listHeader = iprot.ReadListBegin();

                        for (int i = 0; i < listHeader.Count; i++)
                        {
                            var columnChunk = new ColumnChunk();
                            columnChunk.Read(iprot);
                            Columns.Add(columnChunk);
                        }

                        iprot.ReadListEnd();
                        hasColumns = true;
                    }
                    else if (field.ID == 2 && field.Type == TType.I64)
                    {
                        Total_byte_size = iprot.ReadI64();
                        hasTotalByteSize = true;
                    }
                    else if (field.ID == 3 && field.Type == TType.I64)
                    {
                        Num_rows = iprot.ReadI64();
                        hasNumRows = true;
                    }
                    else if (field.ID == 4 && field.Type == TType.List)
                    {
                        Sorting_columns = new List<SortingColumn>();
                        TList listHeader = iprot.ReadListBegin();

                        for (int i = 0; i < listHeader.Count; i++)
                        {
                            var sortingColumn = new SortingColumn();
                            sortingColumn.Read(iprot);
                            Sorting_columns.Add(sortingColumn);
                        }

                        iprot.ReadListEnd();
                    }
                    else if (field.ID == 5 && field.Type == TType.I64)
                    {
                        File_offset = iprot.ReadI64();
                    }
                    else if (field.ID == 6 && field.Type == TType.I64)
                    {
                        Total_compressed_size = iprot.ReadI64();
                    }
                    else if (field.ID == 7 && field.Type == TType.I16)
                    {
                        Ordinal = iprot.ReadI16();
                    }
                    else
                    {
                        // Unknown field id, or a known id carrying an unexpected wire type.
                        TProtocolUtil.Skip(iprot, field.Type);
                    }

                    iprot.ReadFieldEnd();
                }

                iprot.ReadStructEnd();

                // Fields 1-3 must all have been read.
                if (!(hasColumns && hasTotalByteSize && hasNumRows))
                {
                    throw new TProtocolException(TProtocolException.INVALID_DATA);
                }
            }
            finally
            {
                iprot.DecrementRecursionDepth();
            }
        }