/// <summary>
/// Deserializes a Parquet stream into arrays of <typeparamref name="T"/>, yielding one array per row group.
/// The method is an iterator: nothing is read until the result is enumerated, and the reader created here
/// stays open until enumeration finishes or the enumerator is disposed.
/// </summary>
/// <typeparam name="T">Type instantiated once per row; requires a parameterless constructor.</typeparam>
/// <param name="input">Stream passed to the <see cref="ParquetReader"/> constructor.</param>
/// <returns>A deferred sequence with one <typeparamref name="T"/> array per row group, in row group order.</returns>
public static IEnumerable<T[]> DeserializeGroups<T>(Stream input) where T : new()
{
    var clrBridge = new ClrBridge(typeof(T));

    using (var parquetReader = new ParquetReader(input))
    {
        DataField[] fields = parquetReader.Schema.GetDataFields();

        for (int groupIndex = 0; groupIndex < parquetReader.RowGroupCount; groupIndex++)
        {
            using (ParquetRowGroupReader rowGroup = parquetReader.OpenRowGroupReader(groupIndex))
            {
                // Read every data column of this row group up front, in schema field order.
                var columns = new DataColumn[fields.Length];
                for (int fieldIndex = 0; fieldIndex < fields.Length; fieldIndex++)
                {
                    columns[fieldIndex] = rowGroup.ReadColumn(fields[fieldIndex]);
                }

                // One default-constructed instance per row in the group.
                var instances = new T[rowGroup.RowCount];
                for (int row = 0; row < instances.Length; row++)
                {
                    instances[row] = new T();
                }

                // Hand each column to the bridge together with the instance array
                // (presumably it copies column values onto the instances - confirm against ClrBridge).
                foreach (DataColumn column in columns)
                {
                    clrBridge.AssignColumn(column, instances);
                }

                yield return instances;
            }
        }
    }
}
/// <summary>
/// Reads every row group of a Parquet stream and collects the <typeparamref name="T"/> instances
/// produced by the column mapper into a single list. Unlike an iterator, all row groups are read
/// before the method returns.
/// </summary>
/// <typeparam name="T">Type of the instances to create; requires a parameterless constructor.</typeparam>
/// <param name="input">Stream passed to the <see cref="ParquetReader3"/> constructor.</param>
/// <returns>All instances from all row groups, appended in row group order.</returns>
public static IEnumerable<T> Deserialize<T>(Stream input) where T : new()
{
    var instances = new List<T>();
    IColumnClrMapper columnMapper = new SlowReflectionColumnClrMapper(typeof(T));

    using (var parquetReader = new ParquetReader3(input))
    {
        List<DataField> fields = parquetReader.Schema.GetDataFields();

        for (int groupIndex = 0; groupIndex < parquetReader.RowGroupCount; groupIndex++)
        {
            using (ParquetRowGroupReader rowGroup = parquetReader.OpenRowGroupReader(groupIndex))
            {
                // Read the group's data columns in schema field order.
                var columns = new List<DataColumn>(fields.Count);
                foreach (DataField field in fields)
                {
                    columns.Add(rowGroup.ReadColumn(field));
                }

                IReadOnlyCollection<T> groupInstances = columnMapper.CreateClassInstances<T>(columns);
                instances.AddRange(groupInstances);
            }
        }
    }

    return instances;
}
/// <summary>
/// Reads all data columns of a single row group in one call.
/// </summary>
/// <param name="rowGroupIndex">Index of the row group to read; defaults to 0 (the first row group).</param>
/// <returns>One <see cref="DataColumn"/> per data field of the schema, in the order the schema reports its data fields.</returns>
public DataColumn[] ReadEntireRowGroup(int rowGroupIndex = 0)
{
    DataField[] fields = Schema.GetDataFields();
    var columns = new DataColumn[fields.Length];

    using (ParquetRowGroupReader rowGroup = OpenRowGroupReader(rowGroupIndex))
    {
        int position = 0;
        foreach (DataField field in fields)
        {
            columns[position] = rowGroup.ReadColumn(field);
            position++;
        }
    }

    return columns;
}
/// <summary>
/// Opens a Parquet reader over the stream and reads the schema plus every data column of row group 0.
/// </summary>
/// <param name="stream">Stream passed to the <see cref="ParquetReader"/> constructor.</param>
/// <param name="schema">Receives the schema reported by the reader.</param>
/// <param name="columns">Receives one <see cref="DataColumn"/> per data field of the schema, in schema field order.</param>
public static void ReadSingleRowGroupParquetFile(this Stream stream, out Schema schema, out DataColumn[] columns)
{
    using (var parquetReader = new ParquetReader(stream))
    {
        schema = parquetReader.Schema;

        using (ParquetRowGroupReader firstGroup = parquetReader.OpenRowGroupReader(0))
        {
            DataField[] fields = schema.GetDataFields();
            columns = new DataColumn[fields.Length];

            int position = 0;
            foreach (DataField field in fields)
            {
                columns[position] = firstGroup.ReadColumn(field);
                position++;
            }
        }
    }
}
/// <summary>
/// Reads an open row group as a <see cref="Table"/>.
/// </summary>
/// <param name="rowGroupReader">Open row group reader to read the columns from.</param>
/// <param name="schema">File schema; its data fields determine which columns are read and in which order.</param>
/// <param name="rowCount">
/// Row count passed to the table constructor; when <c>null</c>, the row group's own
/// <c>RowCount</c> is used so that all rows are included.
/// </param>
/// <returns>A new table built from the schema and the columns read from the row group.</returns>
public static Table ReadAsTable(this ParquetRowGroupReader rowGroupReader, Schema schema, int? rowCount)
{
    // Read every data column described by the schema from this row group.
    DataColumn[] allData = schema.GetDataFields()
        .Select(df => rowGroupReader.ReadColumn(df))
        .ToArray();

    // A single row group never needs merging, so the table is returned directly
    // (the former null-check/merge branch could never execute).
    return new Table(schema, allData, rowCount ?? rowGroupReader.RowCount);
}
/// <summary>
/// Reads one row group and builds an array of <typeparamref name="T"/> with one element per row.
/// </summary>
/// <typeparam name="T">Type instantiated once per row; requires a parameterless constructor.</typeparam>
/// <param name="rowGroupIndex">Index of the row group to open on <paramref name="reader"/>.</param>
/// <param name="reader">Open Parquet reader the row group is read from.</param>
/// <param name="dataFields">Data fields whose columns are read, in this order.</param>
/// <returns>The instances created for the row group.</returns>
private static T[] ReadAndDeserializeByRowGroup<T>(int rowGroupIndex, ParquetReader reader, DataField[] dataFields) where T : new()
{
    var clrBridge = new ClrBridge(typeof(T));

    using (ParquetRowGroupReader rowGroup = reader.OpenRowGroupReader(rowGroupIndex))
    {
        // Read all requested columns before creating any instance.
        var columns = new DataColumn[dataFields.Length];
        for (int fieldIndex = 0; fieldIndex < dataFields.Length; fieldIndex++)
        {
            columns[fieldIndex] = rowGroup.ReadColumn(dataFields[fieldIndex]);
        }

        // One default-constructed instance per row in the group.
        var instances = new T[rowGroup.RowCount];
        for (int row = 0; row < instances.Length; row++)
        {
            instances[row] = new T();
        }

        // Hand each column to the bridge together with the instance array
        // (presumably it copies column values onto the instances - confirm against ClrBridge).
        foreach (DataColumn column in columns)
        {
            clrBridge.AssignColumn(column, instances);
        }

        return instances;
    }
}
/// <summary>
/// Reads every row group of the file and merges them into a single <see cref="Table"/>.
/// The table built from the first row group becomes the result; rows of each subsequent
/// row group are appended to it.
/// </summary>
/// <param name="reader">Open reader</param>
/// <returns>
/// The merged table, or <c>null</c> when the reader reports zero row groups.
/// </returns>
public static Table ReadAsTable(this ParquetReader reader)
{
    // The schema and its data fields are the same for every row group, so look them up once.
    Schema schema = reader.Schema;
    var dataFields = schema.GetDataFields();

    Table result = null;

    for (int i = 0; i < reader.RowGroupCount; i++)
    {
        using (ParquetRowGroupReader rowGroupReader = reader.OpenRowGroupReader(i))
        {
            DataColumn[] allData = dataFields
                .Select(df => rowGroupReader.ReadColumn(df))
                .ToArray();

            var groupTable = new Table(schema, allData, rowGroupReader.RowCount);

            if (result == null)
            {
                // First row group: its table is reused as the accumulator.
                result = groupTable;
            }
            else
            {
                // Later row groups: append their rows to the accumulated table.
                foreach (Row row in groupTable)
                {
                    result.Add(row);
                }
            }
        }
    }

    return result;
}