/// <summary>
/// Writes the dataset out to the output stream, splitting it into row groups
/// of at most <c>_writerOptions.RowGroupsSize</c> rows each.
/// </summary>
/// <param name="dataSet">Dataset to write.</param>
/// <param name="compression">Compression method applied to every column chunk.</param>
/// <param name="append">When true, appends to the file, otherwise creates a new file.</param>
public void Write(DataSet dataSet, CompressionMethod compression = CompressionMethod.Gzip, bool append = false)
{
   PrepareFile(dataSet, append);

   var stats = new DataSetStats(dataSet);

   int offset = 0;
   int count;
   do
   {
      // rows in this row group; the final group may be smaller than RowGroupsSize
      count = Math.Min(_writerOptions.RowGroupsSize, dataSet.Count - offset);
      Thrift.RowGroup rg = _meta.AddRowGroup();

      rg.Columns = dataSet.Schema.Elements
         .Select(c => Write(c, dataSet.GetColumn(c.Name, offset, count), compression, stats.GetColumnStats(c)))
         .ToList();

      // Per the Parquet format spec, RowGroup.total_byte_size is the sum of the
      // *uncompressed* sizes of all column data in the group; ColumnChunk metadata
      // already includes page+header sizes. (Previously this summed the compressed
      // sizes, contradicting both the spec and the comment above it.)
      rg.Total_byte_size = rg.Columns.Sum(c => c.Meta_data.Total_uncompressed_size);
      rg.Num_rows = count;

      offset += _writerOptions.RowGroupsSize;
   }
   while (offset < dataSet.Count);

   _dataWritten = true;
}
/// <summary>
/// Writes the dataset out to the output stream, splitting it into row groups
/// of at most <c>_writerOptions.RowGroupsSize</c> rows each. Each column of a
/// row group is serialised through a dedicated <c>ColumnWriter</c>.
/// </summary>
/// <param name="dataSet">Dataset to write.</param>
/// <param name="compression">Compression method applied to every column chunk.</param>
/// <param name="append">When true, appends to the file, otherwise creates a new file.</param>
public void Write(DataSet dataSet, CompressionMethod compression = CompressionMethod.Gzip, bool append = false)
{
   PrepareFile(dataSet, append);

   int offset = 0;
   int count;
   do
   {
      // rows in this row group; the final group may be smaller than RowGroupsSize
      count = Math.Min(_writerOptions.RowGroupsSize, dataSet.Count - offset);
      Thrift.RowGroup rg = _meta.AddRowGroup();

      rg.Columns = new List<Thrift.ColumnChunk>();
      foreach (SchemaElement se in dataSet.Schema.Flatten())
      {
         var cw = new ColumnWriter(Stream, ThriftStream, _meta, se, compression, _formatOptions, _writerOptions);
         IList values = dataSet.GetColumn(se, offset, count);
         Thrift.ColumnChunk chunk = cw.Write(offset, count, values);
         rg.Columns.Add(chunk);
      }

      // Per the Parquet format spec, RowGroup.total_byte_size is the sum of the
      // *uncompressed* sizes of all column data in the group; ColumnChunk metadata
      // already includes page+header sizes. (Previously this summed the compressed
      // sizes, contradicting both the spec and the comment above it.)
      rg.Total_byte_size = rg.Columns.Sum(c => c.Meta_data.Total_uncompressed_size);
      rg.Num_rows = count;

      offset += _writerOptions.RowGroupsSize;
   }
   while (offset < dataSet.Count);

   _dataWritten = true;
}