/// <summary>
/// Writes the next data column to the parquet stream. Note that columns must be written in the
/// order they are declared in the file schema.
/// </summary>
/// <param name="column">Column to write.</param>
public void WriteColumn(DataColumn column)
{
   if (column == null)
   {
      throw new ArgumentNullException(nameof(column));
   }

   if (RowCount == null)
   {
      // the first column that carries data (or has no repetitions) fixes the row count for this row group
      if (column.Data.Length > 0 || column.Field.MaxRepetitionLevel == 0)
      {
         RowCount = column.CalculateRowCount();
      }
   }

   // columns must arrive in schema order, so validate against the next expected schema element
   Thrift.SchemaElement tse = _thschema[_colIdx];
   if (!column.Field.Equals(tse))
   {
      throw new ArgumentException(
         $"cannot write this column, expected '{tse.Name}', passed: '{column.Field.Name}'",
         nameof(column));
   }

   IDataTypeHandler dataTypeHandler = DataTypeFactory.Match(tse, _formatOptions);
   _colIdx += 1;

   List<string> path = _footer.GetPath(tse);

   var writer = new DataColumnWriter(_stream, _thriftStream, _footer, tse,
      _compressionMethod, _compressionLevel, (int)(RowCount ?? 0));

   Thrift.ColumnChunk chunk = writer.Write(path, column, dataTypeHandler);
   _thriftRowGroup.Columns.Add(chunk);
}
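// Illustrative usage sketch, not part of this class: through the library's public API the caller
// typically creates a row group writer and feeds it one DataColumn per schema field, in schema
// order. The ParquetWriter/CreateRowGroup surface below is assumed from the library's documented
// v3-style API and may differ slightly between versions:
//
//    var id = new DataField<int>("id");
//
//    using (var parquetWriter = new ParquetWriter(new Schema(id), fileStream))
//    using (ParquetRowGroupWriter groupWriter = parquetWriter.CreateRowGroup())
//    {
//       groupWriter.WriteColumn(new DataColumn(id, new[] { 1, 2, 3 }));
//    }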
/// <summary>
/// Writes out the dataset to the output stream.
/// </summary>
/// <param name="dataSet">Dataset to write.</param>
/// <param name="compression">Compression method.</param>
/// <param name="append">When true, appends to the file, otherwise creates a new file.</param>
public void Write(DataSet dataSet, CompressionMethod compression = CompressionMethod.Gzip, bool append = false)
{
   PrepareFile(dataSet, append);
   _footer.CustomMetadata = dataSet.Metadata.Custom;

   int offset = 0;
   int count;
   List<Thrift.SchemaElement> writeableSchema = _footer.GetWriteableSchema().ToList();

   do
   {
      // write the dataset in row-group-sized batches
      count = Math.Min(_writerOptions.RowGroupsSize, dataSet.Count - offset);
      Thrift.RowGroup rg = _footer.AddRowGroup();
      long rgStartPos = Stream.Position;

      rg.Columns = new List<Thrift.ColumnChunk>();
      foreach (Thrift.SchemaElement tse in writeableSchema)
      {
         List<string> path = _footer.GetPath(tse);
         string flatPath = string.Join(Schema.PathSeparator, path);
         var cw = new ColumnarWriter(Stream, ThriftStream, _footer, tse, path,
            compression, _formatOptions, _writerOptions);

         IList values = dataSet.GetColumn(flatPath, offset, count);
         Thrift.ColumnChunk chunk = cw.Write(offset, count, values);
         rg.Columns.Add(chunk);
      }

      // row group's size is a sum of _uncompressed_ sizes of all columns in it, including the headers;
      // luckily ColumnChunk already contains the size of page + header in its metadata
      rg.Total_byte_size = rg.Columns.Sum(c => c.Meta_data.Total_compressed_size);
      rg.Num_rows = count;

      offset += _writerOptions.RowGroupsSize;
   }
   while (offset < dataSet.Count);

   _dataWritten = true;
}
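// Illustrative usage sketch, not part of this class: with the DataSet-based API the caller builds
// the rows up front and hands the whole set to Write, which splits it into row groups of
// _writerOptions.RowGroupsSize rows. The DataSet/ParquetWriter construction below is assumed from
// the library's older documented API and may not match this exact version:
//
//    var ds = new DataSet(schema);   // schema declared elsewhere
//    ds.Add(1, "London");
//    ds.Add(2, "Derby");
//
//    using (var writer = new ParquetWriter(outputStream))
//    {
//       writer.Write(ds, CompressionMethod.Snappy, append: false);
//    }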