/// <summary>
/// Opens the file at <paramref name="filePath"/> and returns its contents as a table.
/// </summary>
/// <param name="filePath">Path of the file to open.</param>
/// <param name="parquetOptions">Parquet options, optional.</param>
/// <returns>The table produced by calling <c>ReadAsTable</c> on the opened reader.</returns>
public static Table ReadTableFromFile(string filePath, ParquetOptions parquetOptions = null)
{
    // The reader is disposed as soon as the table has been produced.
    using (ParquetReader fileReader = OpenFromFile(filePath, parquetOptions))
    {
        Table table = fileReader.ReadAsTable();
        return table;
    }
}
/// <summary>
/// Reads a <see cref="DataSet"/> from an already opened stream.
/// </summary>
/// <param name="source">Input stream</param>
/// <param name="formatOptions">Parquet options, optional.</param>
/// <param name="readerOptions">Reader options, optional</param>
/// <returns>The <see cref="DataSet"/> returned by the reader created over <paramref name="source"/>.</returns>
public static DataSet Read(Stream source, ParquetOptions formatOptions = null, ReaderOptions readerOptions = null)
{
    // A temporary reader is created for this call only and disposed before returning.
    using (var parquetReader = new ParquetReader(source, formatOptions, readerOptions))
    {
        DataSet dataSet = parquetReader.Read();
        return dataSet;
    }
}
/// <summary>
/// Creates a reader over <paramref name="stream"/> and returns its contents as a table.
/// </summary>
/// <param name="stream">Input stream</param>
/// <param name="parquetOptions">Parquet options, optional.</param>
/// <returns>The table produced by calling <c>ReadAsTable</c> on the reader.</returns>
public static Table ReadTableFromStream(Stream stream, ParquetOptions parquetOptions = null)
{
    // The reader only lives for the duration of this call.
    using (var streamReader = new ParquetReader(stream, parquetOptions))
    {
        Table table = streamReader.ReadAsTable();
        return table;
    }
}
/// <summary>
/// Lazily deserializes the stream one row group at a time, yielding an array of
/// <typeparamref name="T"/> instances for each row group.
/// </summary>
/// <remarks>
/// This is an iterator: nothing is read until the result is enumerated, and the reader created
/// over <paramref name="input"/> stays open until enumeration finishes or the enumerator is disposed.
/// </remarks>
/// <typeparam name="T">Type to instantiate for every row; must have a parameterless constructor.</typeparam>
/// <param name="input">Input stream</param>
/// <returns>A sequence with one array per row group, in row group order.</returns>
public static IEnumerable<T[]> DeserializeGroups<T>(Stream input) where T : new()
{
    var clrBridge = new ClrBridge(typeof(T));

    using (var parquetReader = new ParquetReader(input))
    {
        DataField[] fields = parquetReader.Schema.GetDataFields();

        for (int groupIndex = 0; groupIndex < parquetReader.RowGroupCount; groupIndex++)
        {
            using (ParquetRowGroupReader rowGroup = parquetReader.OpenRowGroupReader(groupIndex))
            {
                // Read every data field of this row group before creating any instances.
                var columns = new DataColumn[fields.Length];
                for (int f = 0; f < fields.Length; f++)
                {
                    columns[f] = rowGroup.ReadColumn(fields[f]);
                }

                // One freshly constructed instance per row in the group.
                var rows = new T[rowGroup.RowCount];
                for (int r = 0; r < rows.Length; r++)
                {
                    rows[r] = new T();
                }

                // Hand each column, together with the instances, to the bridge.
                foreach (DataColumn column in columns)
                {
                    clrBridge.AssignColumn(column, rows);
                }

                yield return rows;
            }
        }
    }
}
/// <summary>
/// Deserializes the stream into an array of <typeparamref name="T"/>, reading either every
/// row group or one specific row group.
/// </summary>
/// <typeparam name="T">Type to instantiate for every row; must have a parameterless constructor.</typeparam>
/// <param name="input">Input stream</param>
/// <param name="rowGroupIndex">
/// Index of the row group to read, or <c>-1</c> (the default) to read all row groups in order.
/// </param>
/// <returns>All deserialized records collected into a single array.</returns>
public static T[] Deserialize<T>(Stream input, int rowGroupIndex = -1) where T : new()
{
    var records = new List<T>();

    using (var parquetReader = new ParquetReader(input))
    {
        DataField[] fields = parquetReader.Schema.GetDataFields();

        if (rowGroupIndex != -1)
        {
            // A specific row group was requested.
            records.AddRange(ReadAndDeserializeByRowGroup<T>(rowGroupIndex, parquetReader, fields));
        }
        else
        {
            // -1 means: read every row group, appending the records in row group order.
            for (int groupIndex = 0; groupIndex < parquetReader.RowGroupCount; groupIndex++)
            {
                records.AddRange(ReadAndDeserializeByRowGroup<T>(groupIndex, parquetReader, fields));
            }
        }
    }

    return records.ToArray();
}
/// <summary>
/// Reads every row group exposed by the reader and combines them into a single table.
/// </summary>
/// <param name="reader">Open reader</param>
/// <returns>
/// A table created from the first row group with the rows of each following row group appended
/// to it, or <c>null</c> when the reader reports no row groups.
/// </returns>
public static Table ReadAsTable(this ParquetReader reader)
{
    // The schema and its data fields do not depend on the row group, so resolve them once.
    Schema schema = reader.Schema;
    DataField[] dataFields = schema.GetDataFields();

    Table result = null;

    for (int i = 0; i < reader.RowGroupCount; i++)
    {
        using (ParquetRowGroupReader rowGroupReader = reader.OpenRowGroupReader(i))
        {
            DataColumn[] allData = dataFields
                .Select(df => rowGroupReader.ReadColumn(df))
                .ToArray();

            var t = new Table(schema, allData, rowGroupReader.RowCount);

            if (result == null)
            {
                // The first row group becomes the table that is returned.
                result = t;
            }
            else
            {
                // Later row groups are merged in row by row.
                foreach (Row row in t)
                {
                    result.Add(row);
                }
            }
        }
    }

    // Stays null when there were no row groups to read.
    return result;
}
/// <summary>
/// Opens the file at <paramref name="fullPath"/> for reading and reads it into a <see cref="DataSet"/>.
/// </summary>
/// <param name="fullPath">The full path.</param>
/// <param name="formatOptions">Parquet options, optional.</param>
/// <param name="readerOptions">Reader options, optional.</param>
/// <returns>The <see cref="DataSet"/> returned by the reader created over the file stream.</returns>
public static DataSet ReadFile(string fullPath, ParquetOptions formatOptions = null, ReaderOptions readerOptions = null)
{
    // Both the file stream and the reader are disposed (reader first) when the read completes.
    using (Stream fileStream = System.IO.File.OpenRead(fullPath))
    using (var parquetReader = new ParquetReader(fileStream, formatOptions, readerOptions))
    {
        DataSet dataSet = parquetReader.Read();
        return dataSet;
    }
}
/// <summary>
/// Reads the schema and all data columns of the first row group (index 0) from a stream.
/// </summary>
/// <param name="stream">Input stream</param>
/// <param name="schema">Receives the schema reported by the reader.</param>
/// <param name="columns">
/// Receives one column per data field of <paramref name="schema"/>, in the same order as the data fields.
/// </param>
public static void ReadSingleRowGroupParquetFile(this Stream stream, out Schema schema, out DataColumn[] columns)
{
    using (var parquetReader = new ParquetReader(stream))
    {
        schema = parquetReader.Schema;

        // Only row group 0 is opened; any further row groups are not read.
        using (ParquetRowGroupReader firstGroup = parquetReader.OpenRowGroupReader(0))
        {
            DataField[] fields = schema.GetDataFields();
            columns = new DataColumn[fields.Length];

            int slot = 0;
            foreach (DataField field in fields)
            {
                columns[slot++] = firstGroup.ReadColumn(field);
            }
        }
    }
}
/// <summary>
/// Reads one row group from an open reader and turns it into an array of <typeparamref name="T"/>.
/// </summary>
/// <typeparam name="T">Type to instantiate for every row; must have a parameterless constructor.</typeparam>
/// <param name="rowGroupIndex">Index of the row group to open.</param>
/// <param name="reader">Open reader</param>
/// <param name="dataFields">Data fields whose columns are read from the row group.</param>
/// <returns>One instance per row of the row group.</returns>
private static T[] ReadAndDeserializeByRowGroup<T>(int rowGroupIndex, ParquetReader reader, DataField[] dataFields) where T : new()
{
    var clrBridge = new ClrBridge(typeof(T));

    using (ParquetRowGroupReader rowGroup = reader.OpenRowGroupReader(rowGroupIndex))
    {
        // Read every requested column before creating any instances.
        var columns = new DataColumn[dataFields.Length];
        for (int f = 0; f < dataFields.Length; f++)
        {
            columns[f] = rowGroup.ReadColumn(dataFields[f]);
        }

        // One freshly constructed instance per row in the group.
        var rows = new T[rowGroup.RowCount];
        for (int r = 0; r < rows.Length; r++)
        {
            rows[r] = new T();
        }

        // Hand each column, together with the instances, to the bridge.
        foreach (DataColumn column in columns)
        {
            clrBridge.AssignColumn(column, rows);
        }

        return rows;
    }
}