Beispiel #1
0
 /// <summary>
 /// Opens the Parquet file at <paramref name="filePath"/> and returns the result of
 /// calling <c>ReadAsTable()</c> on the opened reader.
 /// </summary>
 /// <param name="filePath">Path of the file, forwarded to <c>OpenFromFile</c>.</param>
 /// <param name="parquetOptions">Parquet options forwarded to <c>OpenFromFile</c>; defaults to <c>null</c>.</param>
 /// <returns>The table produced by the reader.</returns>
 public static Table ReadTableFromFile(string filePath, ParquetOptions parquetOptions = null)
 {
     // The reader is disposed when the using block exits, after the table has been produced.
     using (ParquetReader fileReader = OpenFromFile(filePath, parquetOptions))
     {
         Table table = fileReader.ReadAsTable();
         return table;
     }
 }
Beispiel #2
0
 /// <summary>
 /// Creates a <see cref="ParquetReader"/> over an open stream and reads it into a <see cref="DataSet"/>.
 /// </summary>
 /// <param name="source">Input stream the reader is constructed over.</param>
 /// <param name="formatOptions">Parquet options passed to the reader, optional.</param>
 /// <param name="readerOptions">Reader options passed to the reader, optional.</param>
 /// <returns>The <see cref="DataSet"/> returned by the reader.</returns>
 public static DataSet Read(Stream source, ParquetOptions formatOptions = null, ReaderOptions readerOptions = null)
 {
     // The reader only lives for the duration of this call; it is disposed on exit.
     using (var parquetReader = new ParquetReader(source, formatOptions, readerOptions))
     {
         DataSet dataSet = parquetReader.Read();
         return dataSet;
     }
 }
Beispiel #3
0
 /// <summary>
 /// Creates a <see cref="ParquetReader"/> over <paramref name="stream"/> and returns the result of
 /// calling <c>ReadAsTable()</c> on it.
 /// </summary>
 /// <param name="stream">Input stream the reader is constructed over.</param>
 /// <param name="parquetOptions">Parquet options passed to the reader; defaults to <c>null</c>.</param>
 /// <returns>The table produced by the reader.</returns>
 public static Table ReadTableFromStream(Stream stream, ParquetOptions parquetOptions = null)
 {
     // The reader is disposed when the using block exits, after the table has been produced.
     using (var streamReader = new ParquetReader(stream, parquetOptions))
     {
         Table table = streamReader.ReadAsTable();
         return table;
     }
 }
Beispiel #4
0
        /// <summary>
        /// Deserializes a Parquet stream one row group at a time, yielding an array of
        /// <typeparamref name="T"/> instances for each row group.
        /// </summary>
        /// <typeparam name="T">Element type; one instance is created per row with its parameterless constructor.</typeparam>
        /// <param name="input">Stream the <see cref="ParquetReader"/> is constructed over.</param>
        /// <returns>
        /// A sequence with one array per row group, in row group order. This is an iterator method:
        /// nothing is read until the sequence is enumerated, and the reader stays open between items.
        /// </returns>
        public static IEnumerable <T[]> DeserializeGroups <T>(Stream input) where T : new()
        {
            var clrBridge = new ClrBridge(typeof(T));

            using (var parquetReader = new ParquetReader(input))
            {
                DataField[] fields = parquetReader.Schema.GetDataFields();

                int groupIndex = 0;
                while (groupIndex < parquetReader.RowGroupCount)
                {
                    using (ParquetRowGroupReader rowGroup = parquetReader.OpenRowGroupReader(groupIndex))
                    {
                        // Read every data field of the schema from this row group, in schema order.
                        var columns = new DataColumn[fields.Length];
                        for (int c = 0; c < fields.Length; c++)
                        {
                            columns[c] = rowGroup.ReadColumn(fields[c]);
                        }

                        // Pre-create one instance per row so the bridge has objects to assign into.
                        var records = new T[rowGroup.RowCount];
                        for (int r = 0; r < records.Length; r++)
                        {
                            records[r] = new T();
                        }

                        foreach (DataColumn column in columns)
                        {
                            clrBridge.AssignColumn(column, records);
                        }

                        yield return records;
                    }

                    groupIndex++;
                }
            }
        }
Beispiel #5
0
        /// <summary>
        /// Deserializes a Parquet stream into an array of <typeparamref name="T"/>, reading either
        /// every row group or one specific row group.
        /// </summary>
        /// <typeparam name="T">Element type; must have a parameterless constructor.</typeparam>
        /// <param name="input">Stream the <see cref="ParquetReader"/> is constructed over.</param>
        /// <param name="rowGroupIndex">
        /// Index of the row group to read. The default of <c>-1</c> means all row groups are read
        /// and concatenated in order.
        /// </param>
        /// <returns>The deserialized records.</returns>
        public static T[] Deserialize <T>(Stream input, int rowGroupIndex = -1) where T : new()
        {
            var records = new List<T>();

            using (var parquetReader = new ParquetReader(input))
            {
                DataField[] fields = parquetReader.Schema.GetDataFields();

                if (rowGroupIndex != -1)
                {
                    // A specific row group was requested.
                    records.AddRange(ReadAndDeserializeByRowGroup<T>(rowGroupIndex, parquetReader, fields));
                }
                else
                {
                    // -1 is the sentinel for "read every row group".
                    for (int g = 0; g < parquetReader.RowGroupCount; g++)
                    {
                        records.AddRange(ReadAndDeserializeByRowGroup<T>(g, parquetReader, fields));
                    }
                }
            }

            return records.ToArray();
        }
        /// <summary>
        /// Reads every row group exposed by the reader and combines them into a single table.
        /// </summary>
        /// <param name="reader">Open reader</param>
        /// <returns>
        /// A table holding the rows of all row groups in row group order, or <c>null</c> when the
        /// reader reports no row groups.
        /// </returns>
        public static Table ReadAsTable(this ParquetReader reader)
        {
            // Resolve the schema and its data fields once instead of on every loop iteration.
            Schema schema = reader.Schema;
            DataField[] dataFields = schema.GetDataFields();

            Table result = null;

            for (int i = 0; i < reader.RowGroupCount; i++)
            {
                using (ParquetRowGroupReader rowGroupReader = reader.OpenRowGroupReader(i))
                {
                    DataColumn[] allData = dataFields.Select(df => rowGroupReader.ReadColumn(df)).ToArray();

                    var t = new Table(schema, allData, rowGroupReader.RowCount);

                    if (result == null)
                    {
                        // The first row group's table becomes the accumulator.
                        result = t;
                    }
                    else
                    {
                        // Later row groups are appended to it row by row.
                        foreach (Row row in t)
                        {
                            result.Add(row);
                        }
                    }
                }
            }

            return result;
        }
Beispiel #7
0
 /// <summary>
 /// Opens the file at <paramref name="fullPath"/> for reading and reads it into a <see cref="DataSet"/>.
 /// </summary>
 /// <param name="fullPath">Path handed to <c>System.IO.File.OpenRead</c>.</param>
 /// <param name="formatOptions">Parquet format options passed to the reader, optional.</param>
 /// <param name="readerOptions">Reader options passed to the reader, optional.</param>
 /// <returns>The <see cref="DataSet"/> returned by the reader.</returns>
 public static DataSet ReadFile(string fullPath, ParquetOptions formatOptions = null, ReaderOptions readerOptions = null)
 {
     // Disposal runs in reverse order on exit: the reader first, then the file stream.
     using (Stream fileStream = System.IO.File.OpenRead(fullPath))
     using (var parquetReader = new ParquetReader(fileStream, formatOptions, readerOptions))
     {
         return parquetReader.Read();
     }
 }
        /// <summary>
        /// Reads the schema and all data columns of the first row group (index 0) from a Parquet stream.
        /// </summary>
        /// <param name="stream">Stream the <see cref="ParquetReader"/> is constructed over.</param>
        /// <param name="schema">Receives the schema reported by the reader.</param>
        /// <param name="columns">Receives one column per data field of the schema, in the same order.</param>
        public static void ReadSingleRowGroupParquetFile(this Stream stream, out Schema schema, out DataColumn[] columns)
        {
            using (var parquetReader = new ParquetReader(stream))
            {
                schema = parquetReader.Schema;

                // Only row group 0 is opened; any further row groups are not read.
                using (ParquetRowGroupReader firstGroup = parquetReader.OpenRowGroupReader(0))
                {
                    DataField[] fields = schema.GetDataFields();
                    columns = new DataColumn[fields.Length];

                    int position = 0;
                    foreach (DataField field in fields)
                    {
                        columns[position] = firstGroup.ReadColumn(field);
                        position++;
                    }
                }
            }
        }
Beispiel #9
0
        /// <summary>
        /// Reads one row group and materializes it as an array of <typeparamref name="T"/>.
        /// </summary>
        /// <typeparam name="T">Element type; one instance is created per row with its parameterless constructor.</typeparam>
        /// <param name="rowGroupIndex">Index passed to <c>OpenRowGroupReader</c>.</param>
        /// <param name="reader">Reader the row group is opened from.</param>
        /// <param name="dataFields">Fields whose columns are read from the row group.</param>
        /// <returns>One populated instance per row of the row group.</returns>
        private static T[] ReadAndDeserializeByRowGroup <T>(int rowGroupIndex, ParquetReader reader, DataField[] dataFields) where T : new()
        {
            var clrBridge = new ClrBridge(typeof(T));

            using (ParquetRowGroupReader rowGroup = reader.OpenRowGroupReader(rowGroupIndex))
            {
                // Read the requested fields from this row group, in the order given.
                var columns = new DataColumn[dataFields.Length];
                for (int c = 0; c < dataFields.Length; c++)
                {
                    columns[c] = rowGroup.ReadColumn(dataFields[c]);
                }

                // Pre-create one instance per row so the bridge has objects to assign into.
                var records = new T[rowGroup.RowCount];
                for (int r = 0; r < records.Length; r++)
                {
                    records[r] = new T();
                }

                foreach (DataColumn column in columns)
                {
                    clrBridge.AssignColumn(column, records);
                }

                return records;
            }
        }