Example #1
        /// <summary>
        /// Writes the next data column to the Parquet stream. Note that columns must be written in the order they are
        /// declared in the file schema.
        /// </summary>
        /// <param name="column">Column data to write.</param>
        public void WriteColumn(DataColumn column)
        {
            if (column == null)
            {
                throw new ArgumentNullException(nameof(column));
            }

            // infer the row count from the first column that either contains data or is not repeated
            if (RowCount == null)
            {
                if (column.Data.Length > 0 || column.Field.MaxRepetitionLevel == 0)
                {
                    RowCount = column.CalculateRowCount();
                }
            }

            // columns must arrive in schema order; validate against the next expected schema element
            Thrift.SchemaElement tse = _thschema[_colIdx];
            if (!column.Field.Equals(tse))
            {
                throw new ArgumentException($"cannot write this column, expected '{tse.Name}', passed: '{column.Field.Name}'", nameof(column));
            }
            IDataTypeHandler dataTypeHandler = DataTypeFactory.Match(tse, _formatOptions);

            _colIdx += 1;

            List<string> path = _footer.GetPath(tse);

            var writer = new DataColumnWriter(_stream, _thriftStream, _footer, tse,
                                              _compressionMethod, _compressionLevel,
                                              (int)(RowCount ?? 0));

            Thrift.ColumnChunk chunk = writer.Write(path, column, dataTypeHandler);
            _thriftRowGroup.Columns.Add(chunk);
        }
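For context, here is a minimal caller sketch, assuming this method lives on Parquet.Net's row group writer (obtained via ParquetWriter.CreateRowGroup()); the schema, field names and file name below are illustrative assumptions, not part of the excerpt above.

    using System.IO;
    using Parquet;
    using Parquet.Data;

    // Hypothetical schema and output file; only WriteColumn itself comes from the excerpt.
    var idField   = new DataField<int>("id");
    var nameField = new DataField<string>("name");
    var schema    = new Schema(idField, nameField);

    using (Stream fs = File.Create("people.parquet"))
    using (var writer = new ParquetWriter(schema, fs))
    using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
    {
        // columns are written in the same order they are declared in the schema
        rg.WriteColumn(new DataColumn(idField, new[] { 1, 2, 3 }));
        rg.WriteColumn(new DataColumn(nameField, new[] { "alice", "bob", "carol" }));
    }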
Example #2
        /// <summary>
        /// Writes the data set to the output stream.
        /// </summary>
        /// <param name="dataSet">Data set to write.</param>
        /// <param name="compression">Compression method.</param>
        /// <param name="append">When true, appends to the file; otherwise creates a new file.</param>
        public void Write(DataSet dataSet, CompressionMethod compression = CompressionMethod.Gzip, bool append = false)
        {
            PrepareFile(dataSet, append);
            _footer.CustomMetadata = dataSet.Metadata.Custom;

            int offset = 0;
            int count;
            List<Thrift.SchemaElement> writeableSchema = _footer.GetWriteableSchema().ToList();

            // write the data set in row groups of at most _writerOptions.RowGroupsSize rows each
            do
            {
                count = Math.Min(_writerOptions.RowGroupsSize, dataSet.Count - offset);
                Thrift.RowGroup rg         = _footer.AddRowGroup();
                long            rgStartPos = Stream.Position;

                rg.Columns = new List<Thrift.ColumnChunk>();

                foreach (Thrift.SchemaElement tse in writeableSchema)
                {
                    List<string> path     = _footer.GetPath(tse);
                    string       flatPath = string.Join(Schema.PathSeparator, path);
                    var          cw       = new ColumnarWriter(Stream, ThriftStream, _footer, tse, path, compression, _formatOptions, _writerOptions);

                    IList values             = dataSet.GetColumn(flatPath, offset, count);
                    Thrift.ColumnChunk chunk = cw.Write(offset, count, values);
                    rg.Columns.Add(chunk);
                }

                //per the Parquet spec, a row group's Total_byte_size is the sum of the _uncompressed_ sizes
                //of all its columns, including the headers; conveniently, each ColumnChunk already carries
                //the page+header sizes in its metadata (note that this implementation sums Total_compressed_size)
                rg.Total_byte_size = rg.Columns.Sum(c => c.Meta_data.Total_compressed_size);
                rg.Num_rows        = count;

                offset += _writerOptions.RowGroupsSize;
            } while (offset < dataSet.Count);

            _dataWritten = true;
        }
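For context, a minimal caller sketch, assuming the legacy DataSet-based API of parquet-dotnet where ParquetWriter wraps an output stream; the field names, row values and file name below are illustrative assumptions, not part of the excerpt above.

    using System.IO;
    using Parquet;
    using Parquet.Data;

    // Hypothetical data set; only Write(dataSet, compression, append) comes from the excerpt.
    var ds = new DataSet(
        new DataField<int>("id"),
        new DataField<string>("city"));
    ds.Add(1, "London");
    ds.Add(2, "Berlin");

    using (Stream fs = File.Create("cities.parquet"))
    using (var writer = new ParquetWriter(fs))
    {
        // Gzip is the default shown in the signature; Snappy is passed here only for illustration
        writer.Write(ds, CompressionMethod.Snappy);
    }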