/// <summary>
/// Writes table to this row group
/// </summary>
/// <param name="writer">Row group writer that receives the columns</param>
/// <param name="table">Table whose extracted data columns are written, one by one, to the row group</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="writer"/> or <paramref name="table"/> is null
/// </exception>
public static void Write(this ParquetRowGroupWriter writer, Table table)
{
    // Fail fast with a named argument instead of a bare NullReferenceException later on.
    if (writer == null)
    {
        throw new ArgumentNullException(nameof(writer));
    }

    if (table == null)
    {
        throw new ArgumentNullException(nameof(table));
    }

    foreach (DataColumn dc in table.ExtractDataColumns())
    {
        writer.WriteColumn(dc);
    }
}
/// <summary>
/// Serialises a collection of classes into a Parquet stream
/// </summary>
/// <typeparam name="T">Class type</typeparam>
/// <param name="objectInstances">Collection of classes</param>
/// <param name="destination">Destination stream; must be writeable</param>
/// <param name="schema">Optional schema to use. When not specified the class schema will be discovered and everything possible will be
/// written to the stream. If you want to write only a subset of class properties please specify the schema yourself.
/// </param>
/// <param name="compressionMethod"><see cref="CompressionMethod"/> assigned to the writer</param>
/// <param name="rowGroupSize">Batch size used to split <paramref name="objectInstances"/>; every batch is written
/// as a separate row group. Must be greater than zero.</param>
/// <param name="append">Forwarded as the <c>append</c> argument of the <see cref="ParquetWriter"/> constructor</param>
/// <returns>The schema that was used for writing: either the one passed in or the one reflected from <typeparamref name="T"/></returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="objectInstances"/> or <paramref name="destination"/> is null
/// </exception>
/// <exception cref="ArgumentException"><paramref name="destination"/> is not writeable</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="rowGroupSize"/> is zero or negative</exception>
public static Schema Serialize<T>(IEnumerable<T> objectInstances, Stream destination,
    Schema schema = null,
    CompressionMethod compressionMethod = CompressionMethod.Snappy,
    int rowGroupSize = 5000,
    bool append = false)
    where T : new()
{
    if (objectInstances == null)
    {
        throw new ArgumentNullException(nameof(objectInstances));
    }

    if (destination == null)
    {
        throw new ArgumentNullException(nameof(destination));
    }

    if (!destination.CanWrite)
    {
        throw new ArgumentException("stream must be writeable", nameof(destination));
    }

    // Reject a non-positive batch size before the writer is opened on the destination stream,
    // so an invalid argument is reported before any writing can start.
    if (rowGroupSize <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(rowGroupSize), rowGroupSize,
            "row group size must be greater than zero");
    }

    //if schema is not passed reflect it
    if (schema == null)
    {
        schema = SchemaReflector.Reflect<T>();
    }

    using (var writer = new ParquetWriter(schema, destination, append: append))
    {
        writer.CompressionMethod = compressionMethod;

        DataField[] dataFields = schema.GetDataFields();

        foreach (IEnumerable<T> batch in objectInstances.Batch(rowGroupSize))
        {
            // A fresh bridge per batch, as before; whether ClrBridge keeps per-instance
            // state is not visible from here, so it is deliberately not hoisted.
            var bridge = new ClrBridge(typeof(T));

            // Materialise the batch once so every column is built from the same snapshot.
            T[] batchArray = batch.ToArray();

            DataColumn[] columns = dataFields
                .Select(df => bridge.BuildColumn(df, batchArray, batchArray.Length))
                .ToArray();

            // One row group per batch.
            using (ParquetRowGroupWriter groupWriter = writer.CreateRowGroup())
            {
                foreach (DataColumn dataColumn in columns)
                {
                    groupWriter.WriteColumn(dataColumn);
                }
            }
        }
    }

    return schema;
}
/// <summary>
/// Writes a Parquet file consisting of exactly one row group to the stream.
/// The writer's compression method is set to <see cref="CompressionMethod.None"/>.
/// </summary>
/// <param name="stream">Stream handed to the <see cref="ParquetWriter"/></param>
/// <param name="schema">Schema handed to the <see cref="ParquetWriter"/></param>
/// <param name="columns">Columns written, in the given order, into the single row group</param>
public static void WriteSingleRowGroupParquetFile(this Stream stream, Schema schema, params DataColumn[] columns)
{
    using (var parquetWriter = new ParquetWriter(schema, stream))
    {
        parquetWriter.CompressionMethod = CompressionMethod.None;

        using (ParquetRowGroupWriter rowGroup = parquetWriter.CreateRowGroup())
        {
            for (int index = 0; index < columns.Length; index++)
            {
                rowGroup.WriteColumn(columns[index]);
            }
        }
    }
}