Example #1
        public void FileMetadata_sets_num_rows_on_file_and_row_group_multiple_row_groups()
        {
            var ms = new MemoryStream();
            var id = new DataField<int>("id");

            //write
            using (var writer = new ParquetWriter(new Schema(id), ms))
            {
                using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
                {
                    rg.WriteColumn(new DataColumn(id, new[] { 1, 2, 3, 4 }));
                }

                using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
                {
                    rg.WriteColumn(new DataColumn(id, new[] { 5, 6 }));
                }
            }

            //read back (rewind the stream first)
            ms.Position = 0;
            using (var reader = new ParquetReader(ms))
            {
                Assert.Equal(6, reader.ThriftMetadata.Num_rows);

                using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(0))
                {
                    Assert.Equal(4, rg.RowCount);
                }

                using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(1))
                {
                    Assert.Equal(2, rg.RowCount);
                }
            }
        }
Example #2
        public void Write_in_small_row_groups()
        {
            //write a single file having 3 row groups
            var id = new DataField<int>("id");
            var ms = new MemoryStream();

            using (var writer = new ParquetWriter(new Schema(id), ms))
            {
                using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
                {
                    rg.WriteColumn(new DataColumn(id, new int[] { 1 }));
                }

                using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
                {
                    rg.WriteColumn(new DataColumn(id, new int[] { 2 }));
                }

                using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
                {
                    rg.WriteColumn(new DataColumn(id, new int[] { 3 }));
                }
            }

            //read the file back and validate
            ms.Position = 0;
            using (var reader = new ParquetReader(ms))
            {
                Assert.Equal(3, reader.RowGroupCount);

                using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(0))
                {
                    Assert.Equal(1, rg.RowCount);
                    DataColumn dc = rg.ReadColumn(id);
                    Assert.Equal(new int[] { 1 }, dc.Data);
                }

                using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(1))
                {
                    Assert.Equal(1, rg.RowCount);
                    DataColumn dc = rg.ReadColumn(id);
                    Assert.Equal(new int[] { 2 }, dc.Data);
                }

                using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(2))
                {
                    Assert.Equal(1, rg.RowCount);
                    DataColumn dc = rg.ReadColumn(id);
                    Assert.Equal(new int[] { 3 }, dc.Data);
                }
            }
        }
Example #3
        public static Dictionary<int, string> ReadParquetFile(string infile)
        {
            Dictionary<int, string> serializedRequests = new Dictionary<int, string>();

            string path = Path.Combine(Directory.GetCurrentDirectory(), infile);

            using (Stream fileStream = File.OpenRead(path))
            {
                using (var parquetReader = new ParquetReader(fileStream))
                {
                    DataField[] dataFields = parquetReader.Schema.GetDataFields();

                    using (ParquetRowGroupReader groupReader = parquetReader.OpenRowGroupReader(0))
                    {
                        DataColumn[] columns      = dataFields.Select(groupReader.ReadColumn).ToArray();
                        DataColumn   firstColumn  = columns[0];
                        DataColumn   secondColumn = columns[1];

                        Array idData      = firstColumn.Data;
                        Array requestData = secondColumn.Data;

                        for (var j = 0; j < firstColumn.Data.Length; j++)
                        {
                            var convertedRequestData = (string)requestData.GetValue(j);
                            var convertedIdData      = (int)idData.GetValue(j);
                            serializedRequests.Add(convertedIdData, convertedRequestData);
                        }
                    }

                    return serializedRequests;
                }
            }
        }
Example #4
        public void ReadIntro()
        {
            // open file stream
            using (Stream fileStream = System.IO.File.OpenRead("c:\\test.parquet"))
            {
                // open parquet file reader
                using (var parquetReader = new ParquetReader(fileStream))
                {
                    // get file schema (available straight after opening parquet reader)
                    // however, get only data fields as only they contain data values
                    DataField[] dataFields = parquetReader.Schema.GetDataFields();

                    // enumerate through row groups in this file
                    for (int i = 0; i < parquetReader.RowGroupCount; i++)
                    {
                        // create row group reader
                        using (ParquetRowGroupReader groupReader = parquetReader.OpenRowGroupReader(i))
                        {
                            // read all columns inside each row group (you can read
                            // only the columns you need, if required)
                            DataColumn[] columns = dataFields.Select(groupReader.ReadColumn).ToArray();

                            // get first column, for instance
                            DataColumn firstColumn = columns[0];

                            // .Data member contains a typed array of column data you can cast to the type of the column
                            Array data = firstColumn.Data;
                            int[] ids  = (int[])data;
                        }
                    }
                }
            }
        }
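
The comment above notes that you can read only the columns you need. A minimal sketch of that variant, under the same assumptions as ReadIntro (Parquet.NET 3.x API as used throughout these examples, System.Linq imported; the file path and the "id" field name are illustrative):

        public void ReadSingleColumn()
        {
            using (Stream fileStream = System.IO.File.OpenRead("c:\\test.parquet"))
            {
                using (var parquetReader = new ParquetReader(fileStream))
                {
                    // pick one field by name instead of materializing every column
                    DataField idField = parquetReader.Schema.GetDataFields()
                                        .First(f => f.Name == "id");

                    for (int i = 0; i < parquetReader.RowGroupCount; i++)
                    {
                        using (ParquetRowGroupReader groupReader = parquetReader.OpenRowGroupReader(i))
                        {
                            // only this column's chunk is read for the row group
                            DataColumn idColumn = groupReader.ReadColumn(idField);
                        }
                    }
                }
            }
        }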
Example #5
        public IEnumerable<DataColumn[]> GetData(string file)
        {
            using (Stream fileStream = System.IO.File.OpenRead(file))
            {
                // open parquet file reader
                using (var parquetReader = new ParquetReader(fileStream))
                {
                    // get file schema (available straight after opening parquet reader)
                    // however, get only data fields as only they contain data values
                    DataField[] dataFields = parquetReader.Schema.GetDataFields();

                    // enumerate through row groups in this file
                    for (int i = 0; i < parquetReader.RowGroupCount; i++)
                    {
                        // create row group reader
                        using (ParquetRowGroupReader groupReader = parquetReader.OpenRowGroupReader(i))
                        {
                            // read all columns inside each row group (you can read
                            // only the columns you need, if required)
                            yield return dataFields.Select(groupReader.ReadColumn).ToArray();
                        }
                    }
                }
            }
        }
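
Note that GetData is an iterator: the file stream stays open until the caller finishes (or disposes) the enumeration, and each yield hands back one row group's columns. A usage sketch (the file name is illustrative):

        foreach (DataColumn[] rowGroupColumns in GetData("data.parquet"))
        {
            // one DataColumn per data field, one row group at a time
            Console.WriteLine($"read {rowGroupColumns.Length} columns");
        }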
Example #6
        public void Write_read_nullable_column(Array input)
        {
            var id = new DataField<int?>("id");
            var ms = new MemoryStream();

            using (var writer = new ParquetWriter(new Schema(id), ms))
            {
                using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
                {
                    rg.WriteColumn(new DataColumn(id, input));
                }
            }

            ms.Position = 0;
            using (var reader = new ParquetReader(ms))
            {
                Assert.Equal(1, reader.RowGroupCount);

                using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(0))
                {
                    Assert.Equal(input.Length, rg.RowCount);
                    Assert.Equal(input, rg.ReadColumn(id).Data);
                }
            }
        }
Example #7
        public static DataTable ParquetReaderToDataTable(ParquetReader parquetReader, out int totalRecordCount, List<string> selectedFields, int offset, int recordCount)
        {
            //Get list of data fields and construct the DataTable
            DataTable dataTable = new DataTable();
            List<Parquet.Data.DataField> fields = new List<Parquet.Data.DataField>();
            var dataFields = parquetReader.Schema.GetDataFields();

            foreach (string selectedField in selectedFields)
            {
                var dataField = dataFields.FirstOrDefault(f => f.Name.Equals(selectedField, StringComparison.InvariantCultureIgnoreCase));
                if (dataField != null)
                {
                    fields.Add(dataField);
                    DataColumn newColumn = new DataColumn(dataField.Name, ParquetNetTypeToCSharpType(dataField.DataType))
                    {
                        // Do not set AllowDBNull here, or line 89 in ProcessRowGroup() will throw
                        // for any required field (values are assigned after the column is added)
                        //AllowDBNull = dataField.HasNulls
                    };
                    dataTable.Columns.Add(newColumn);
                }
                else
                {
                    throw new Exception(string.Format("Field '{0}' does not exist", selectedField));
                }
            }

            //Read column by column to generate each row in the datatable
            totalRecordCount = 0;
            for (int i = 0; i < parquetReader.RowGroupCount; i++)
            {
                int rowsLeftToRead = recordCount;
                using (ParquetRowGroupReader groupReader = parquetReader.OpenRowGroupReader(i))
                {
                    if (groupReader.RowCount > int.MaxValue)
                    {
                        throw new ArgumentOutOfRangeException(string.Format("Cannot handle row group sizes greater than {0}", int.MaxValue));
                    }

                    int rowsPassedUntilThisRowGroup = totalRecordCount;
                    totalRecordCount += (int)groupReader.RowCount;

                    if (offset >= totalRecordCount)
                    {
                        continue;
                    }

                    if (rowsLeftToRead > 0)
                    {
                        int numberOfRecordsToReadFromThisRowGroup = Math.Min(Math.Min(totalRecordCount - offset, recordCount), (int)groupReader.RowCount);
                        rowsLeftToRead -= numberOfRecordsToReadFromThisRowGroup;

                        int recordsToSkipInThisRowGroup = Math.Max(offset - rowsPassedUntilThisRowGroup, 0);

                        ProcessRowGroup(dataTable, groupReader, fields, recordsToSkipInThisRowGroup, numberOfRecordsToReadFromThisRowGroup);
                    }
                }
            }

            return dataTable;
        }
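
To make the paging arithmetic concrete: with two row groups of 100 rows each, offset = 150 and recordCount = 30, the first group is skipped (after it, totalRecordCount = 100 and offset >= totalRecordCount). In the second group the code reads min(min(200 - 150, 30), 100) = 30 records and skips max(150 - 100, 0) = 50 rows, i.e. rows 50..79 of row group 1.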
Example #8
        public void List_of_elements_with_some_items_empty_reads_file()
        {
            /*
             * list data:
             * - 1: [1, 2, 3]
             * - 2: []
             * - 3: [1, 2, 3]
             * - 4: []
             */

            using (var reader = new ParquetReader(OpenTestFile("listofitems-empty-alternates.parquet")))
            {
                using (ParquetRowGroupReader groupReader = reader.OpenRowGroupReader(0))
                {
                    Assert.Equal(4, groupReader.RowCount);
                    DataField[] fs = reader.Schema.GetDataFields();

                    DataColumn id = groupReader.ReadColumn(fs[0]);
                    Assert.Equal(4, id.Data.Length);
                    Assert.False(id.HasRepetitions);

                    DataColumn list = groupReader.ReadColumn(fs[1]);
                    Assert.Equal(8, list.Data.Length);
                    Assert.Equal(new int[] { 0, 1, 1, 0, 0, 1, 1, 0 }, list.RepetitionLevels);
                }
            }
        }
Example #9
        private bool LoadRowGroup(int rowGroup)
        {
            using (var reader = _parquetReader.OpenRowGroupReader(rowGroup))
            {
                _rowIndex = 0;
                _rowCount = reader.RowCount;

                var columns = new List<Parquet.Data.DataColumn>();
                foreach (var f in _fields)
                {
                    try
                    {
                        columns.Add(reader.ReadColumn(f));
                    }
                    catch (IndexOutOfRangeException)
                    {
                        // this happens if every single element in the column within the row group is null;
                        // TODO: Figure out how to read the header to detect this without an exception
                        columns.Add(null);
                    }
                }
                _columns = columns;
            }
            return true;
        }
Example #10
        public static void TestAgainstThirdParty()
        {
            var columns = new Column[] { new Column<decimal>("Decimal", LogicalType.Decimal(precision: 29, scale: 3)) };
            var values  = Enumerable.Range(0, 10_000)
                          .Select(i => ((decimal)i * i * i) / 1000 - 10)
                          .Concat(new [] { decimal.MinValue / 1000, decimal.MaxValue / 1000 })
                          .ToArray();

            using var buffer = new ResizableBuffer();

            // Write using ParquetSharp
            using (var outStream = new BufferOutputStream(buffer))
            {
                using var fileWriter     = new ParquetFileWriter(outStream, columns, Compression.Snappy);
                using var rowGroupWriter = fileWriter.AppendRowGroup();
                using var columnWriter   = rowGroupWriter.NextColumn().LogicalWriter<decimal>();

                columnWriter.WriteBatch(values);

                fileWriter.Close();
            }

            // Read using Parquet.NET
            using var memoryStream   = new MemoryStream(buffer.ToArray());
            using var fileReader     = new ParquetReader(memoryStream);
            using var rowGroupReader = fileReader.OpenRowGroupReader(0);

            var read = (decimal[])rowGroupReader.ReadColumn(fileReader.Schema.GetDataFields()[0]).Data;

            Assert.AreEqual(values, read);
        }
Example #11
        public void Write_multiple_row_groups_to_forward_only_stream()
        {
            var ms          = new MemoryStream();
            var forwardOnly = new WriteableNonSeekableStream(ms);

            var schema = new Schema(
                new DataField<int>("id"),
                new DataField<string>("nonsense"));

            using (var writer = new ParquetWriter(schema, forwardOnly))
            {
                using (ParquetRowGroupWriter rgw = writer.CreateRowGroup(1))
                {
                    rgw.WriteColumn(new DataColumn((DataField)schema[0], new[] { 1 }));
                    rgw.WriteColumn(new DataColumn((DataField)schema[1], new[] { "1" }));
                }

                using (ParquetRowGroupWriter rgw = writer.CreateRowGroup(1))
                {
                    rgw.WriteColumn(new DataColumn((DataField)schema[0], new[] { 2 }));
                    rgw.WriteColumn(new DataColumn((DataField)schema[1], new[] { "2" }));
                }
            }

            ms.Position = 0;
            using (var reader = new ParquetReader(ms))
            {
                Assert.Equal(2, reader.RowGroupCount);

                using (ParquetRowGroupReader rgr = reader.OpenRowGroupReader(0))
                {
                    Assert.Equal(1, rgr.RowCount);

                    DataColumn column = rgr.ReadColumn((DataField)schema[0]);
                    Assert.Equal(1, column.Data.GetValue(0));
                }

                using (ParquetRowGroupReader rgr = reader.OpenRowGroupReader(1))
                {
                    Assert.Equal(1, rgr.RowCount);

                    DataColumn column = rgr.ReadColumn((DataField)schema[0]);
                    Assert.Equal(2, column.Data.GetValue(0));
                }
            }
        }
Example #12
        public void Append_to_file_reads_all_data()
        {
            //write a file with a single row group
            var id = new DataField<int>("id");
            var ms = new MemoryStream();

            using (var writer = new ParquetWriter(new Schema(id), ms))
            {
                using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
                {
                    rg.WriteColumn(new DataColumn(id, new int[] { 1, 2 }));
                }
            }

            //append to this file. Note that you cannot append to existing row group, therefore create a new one
            ms.Position = 0;
            using (var writer = new ParquetWriter(new Schema(id), ms, append: true))
            {
                using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
                {
                    rg.WriteColumn(new DataColumn(id, new int[] { 3, 4 }));
                }
            }

            //check that this file now contains two row groups and all the data is valid
            ms.Position = 0;
            using (var reader = new ParquetReader(ms))
            {
                Assert.Equal(2, reader.RowGroupCount);

                using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(0))
                {
                    Assert.Equal(2, rg.RowCount);
                    Assert.Equal(new int[] { 1, 2 }, rg.ReadColumn(id).Data);
                }

                using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(1))
                {
                    Assert.Equal(2, rg.RowCount);
                    Assert.Equal(new int[] { 3, 4 }, rg.ReadColumn(id).Data);
                }
            }
        }
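
The same append pattern works against a file on disk. A minimal sketch, assuming the stream must be readable, writeable and seekable for append mode and that the schema matches the existing file (the file name is illustrative):

        var id = new DataField<int>("id");

        using (Stream fs = File.Open("data.parquet", FileMode.Open, FileAccess.ReadWrite))
        {
            using (var writer = new ParquetWriter(new Schema(id), fs, append: true))
            {
                using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
                {
                    rg.WriteColumn(new DataColumn(id, new[] { 5, 6 }));
                }
            }
        }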
Example #13
        public static List<T> ReadParquet<T>(this Stream stream) where T : class, new()
        {
            Type classType = typeof(T);

            List<T> results = new List<T>();

            var properties = classType.GetProperties().ToDictionary(p => p.Name, p => p);

            var bytes = stream.ReadAsBytes().GetAwaiter().GetResult();

            using (ParquetReader reader = new ParquetReader(new MemoryStream(bytes)))
            {
                DataField[] fields = reader.Schema.GetDataFields();
                for (int g = 0; g < reader.RowGroupCount; g++)
                {
                    using (ParquetRowGroupReader rowGroupReader = reader.OpenRowGroupReader(g))
                    {
                        DataColumn[] columns = fields.Select(rowGroupReader.ReadColumn).ToArray();
                        if (columns.Length > 0)
                        {
                            for (int i = 0; i < columns[0].Data.Length; i++)
                            {
                                T item = new T();
                                foreach (var column in columns)
                                {
                                    var prop = properties[column.Field.Name];
                                    if (column.Field.DataType == ParquetDataType.DateTimeOffset)
                                    {
                                        if (prop.PropertyType == DateTimeType)
                                        {
                                            prop.SetValue(item, ((DateTimeOffset)column.Data.GetValue(i)).DateTime);
                                        }
                                        else if (prop.PropertyType == NullableDateTimeType)
                                        {
                                            var value = column.Data.GetValue(i);
                                            if (value != null)
                                            {
                                                prop.SetValue(item, ((DateTimeOffset)value).DateTime);
                                            }
                                        }
                                    }
                                    else
                                    {
                                        prop.SetValue(item, column.Data.GetValue(i));
                                    }
                                }
                                results.Add(item);
                            }
                        }
                    }
                }
            }

            return results;
        }
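
A usage sketch for the extension method above, with a hypothetical POCO whose property names match the parquet field names (OrderRow, its properties and the path are illustrative):

        // Hypothetical POCO; properties are matched to columns by name.
        public class OrderRow
        {
            public int Id { get; set; }
            public DateTime OrderedAt { get; set; }
        }

        public static List<OrderRow> LoadOrders(string path)
        {
            using (Stream s = File.OpenRead(path))
            {
                // one OrderRow per row, across all row groups
                return s.ReadParquet<OrderRow>();
            }
        }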
Example #14
        private IEnumerable<DataColumn[]> ReadAllObjects(ParquetReader sr, Func<object, bool?> filterFunc = null)
        {
            DataField[] dataFields = sr.Schema.GetDataFields();

            for (int i = 0; i < sr.RowGroupCount; i++)
            {
                using (ParquetRowGroupReader groupReader = sr.OpenRowGroupReader(i))
                {
                    var dc = dataFields.Select(groupReader.ReadColumn).ToArray();
                    yield return dc;
                }
            }
        }
Example #15
        /// <summary>
        /// Gets a DataTable from a parquet file
        /// </summary>
        public DataTable ParquetReaderToDataTable(string fileName, int offset, int recordCount, out int totalRecordCount)
        {
            DataTable dataTable = new DataTable();

            // Initialize the total record count
            totalRecordCount = 0;
            // Read the file
            using (System.IO.Stream fileReader = System.IO.File.OpenRead(fileName))
            {
                using (ParquetReader parquetReader = new ParquetReader(fileReader))
                {
                    DataField[] dataFields = parquetReader.Schema.GetDataFields();

                    // Create the columns in the table
                    CreateColumns(dataTable, dataFields);
                    //Read column by column to generate each row in the datatable
                    for (int rowGroup = 0; rowGroup < parquetReader.RowGroupCount; rowGroup++)
                    {
                        int rowsLeftToRead = recordCount;

                        using (ParquetRowGroupReader groupReader = parquetReader.OpenRowGroupReader(rowGroup))
                        {
                            if (groupReader.RowCount > int.MaxValue)
                            {
                                throw new ArgumentOutOfRangeException(string.Format("Cannot handle row group sizes greater than {0}", int.MaxValue));
                            }

                            int rowsPassedUntilThisRowGroup = totalRecordCount;
                            totalRecordCount += (int)groupReader.RowCount;

                            if (offset >= totalRecordCount)
                            {
                                continue;
                            }

                            if (rowsLeftToRead > 0)
                            {
                                int numberOfRecordsToReadFromThisRowGroup = Math.Min(Math.Min(totalRecordCount - offset, recordCount), (int)groupReader.RowCount);
                                rowsLeftToRead -= numberOfRecordsToReadFromThisRowGroup;

                                int recordsToSkipInThisRowGroup = Math.Max(offset - rowsPassedUntilThisRowGroup, 0);

                                ProcessRowGroup(dataTable, groupReader, dataFields, recordsToSkipInThisRowGroup, numberOfRecordsToReadFromThisRowGroup);
                            }
                        }
                    }
                }
            }
            // Return the data that was read
            return dataTable;
        }
Example #16
        public static List<Dictionary<string, object>> ReadParquetAdDictData(this Stream stream, List<string> mappedFields = null)
        {
            List<Dictionary<string, object>> results = new List<Dictionary<string, object>>();

            var bytes = stream.ReadAsBytes().GetAwaiter().GetResult();

            using (ParquetReader reader = new ParquetReader(new MemoryStream(bytes)))
            {
                DataField[] fields = reader.Schema.GetDataFields();
                for (int g = 0; g < reader.RowGroupCount; g++)
                {
                    using (ParquetRowGroupReader rowGroupReader = reader.OpenRowGroupReader(g))
                    {
                        DataColumn[] columns = fields.Select(rowGroupReader.ReadColumn).ToArray();
                        if (columns.Length > 0)
                        {
                            Dictionary<string, DataColumn> columnDict = columns.ToDictionary(c => c.Field.Name, c => c);

                            if (mappedFields != null)
                            {
                                Dictionary<string, DataColumn> mappedDict = new Dictionary<string, DataColumn>();
                                for (int i = 0; i < mappedFields.Count; i++)
                                {
                                    var mappedField = mappedFields[i];
                                    if (!mappedDict.ContainsKey(mappedField) && columnDict.ContainsKey(mappedField))
                                    {
                                        mappedDict.Add(mappedField, columnDict[mappedField]);
                                    }
                                }
                                columnDict = mappedDict;
                            }

                            for (int i = 0; i < columns[0].Data.Length; i++)
                            {
                                var item = new Dictionary<string, object>();
                                foreach (var column in columnDict.Values)
                                {
                                    item.Add(column.Field.Name, column.Data.GetValue(i));
                                }
                                results.Add(item);
                            }
                        }
                    }
                }
            }
            return results;
        }
Example #17
        private DataColumn[] ReadParquet(string name, bool treatByteArrayAsString)
        {
            using (Stream s = OpenTestFile(name))
            {
                using (var pr = new ParquetReader(s, new ParquetOptions {
                    TreatByteArrayAsString = treatByteArrayAsString
                }))
                {
                    using (ParquetRowGroupReader rgr = pr.OpenRowGroupReader(0))
                    {
                        return pr.Schema.GetDataFields()
                               .Select(df => rgr.ReadColumn(df))
                               .ToArray();
                    }
                }
            }
        }
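
A usage note for the helper above: the ParquetOptions flag controls whether plain BYTE_ARRAY values are decoded as strings or left as byte arrays. A sketch (the file name is illustrative):

        // decode BYTE_ARRAY values as strings
        DataColumn[] asStrings = ReadParquet("strings.parquet", treatByteArrayAsString: true);

        // keep them as byte[] values
        DataColumn[] asBytes = ReadParquet("strings.parquet", treatByteArrayAsString: false);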
Example #18
        protected object WriteReadSingle(DataField field, object value, CompressionMethod compressionMethod = CompressionMethod.None, int compressionLevel = -1)
        {
            //for sanity, use disconnected streams
            byte[] data;

            using (var ms = new MemoryStream())
            {
                // write single value

                using (var writer = new ParquetWriter(new Schema(field), ms))
                {
                    writer.CompressionMethod = compressionMethod;
                    writer.CompressionLevel  = compressionLevel;

                    using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
                    {
                        Array dataArray = Array.CreateInstance(field.ClrNullableIfHasNullsType, 1);
                        dataArray.SetValue(value, 0);
                        var column = new DataColumn(field, dataArray);

                        rg.WriteColumn(column);
                    }
                }

                data = ms.ToArray();
            }

            using (var ms = new MemoryStream(data))
            {
                // read back single value

                ms.Position = 0;
                using (var reader = new ParquetReader(ms))
                {
                    using (ParquetRowGroupReader rowGroupReader = reader.OpenRowGroupReader(0))
                    {
                        DataColumn column = rowGroupReader.ReadColumn(field);

                        return column.Data.GetValue(0);
                    }
                }
            }
        }
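
A usage sketch for WriteReadSingle: round-trip one value with a specific compression method and check it survives (the field and value are illustrative):

        object roundTripped = WriteReadSingle(new DataField<double>("x"), 1.5D, CompressionMethod.Gzip);
        Assert.Equal(1.5D, roundTripped);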
Example #19
        protected DataColumn[] WriteReadSingleRowGroup(Schema schema, DataColumn[] columns, out Schema readSchema)
        {
            using (var ms = new MemoryStream())
            {
                ms.WriteSingleRowGroupParquetFile(schema, columns);
                ms.Position = 0;

                using (var reader = new ParquetReader(ms))
                {
                    readSchema = reader.Schema;

                    using (ParquetRowGroupReader rgReader = reader.OpenRowGroupReader(0))
                    {
                        return columns.Select(c => rgReader.ReadColumn(c.Field))
                                      .ToArray();
                    }
                }
            }
        }
Example #20
        protected DataColumn WriteReadSingleColumn(DataField field, DataColumn dataColumn)
        {
            using (var ms = new MemoryStream())
            {
                // write with built-in extension method
                ms.WriteSingleRowGroupParquetFile(new Schema(field), dataColumn);
                ms.Position = 0;

                // read first row group and first column
                using (var reader = new ParquetReader(ms))
                {
                    if (reader.RowGroupCount == 0)
                    {
                        return null;
                    }

                    using (ParquetRowGroupReader rgReader = reader.OpenRowGroupReader(0))
                    {
                        return rgReader.ReadColumn(field);
                    }
                }
            }
        }
Example #21
        public T[] LoadColumn<T>(DataField column)
        {
            if (_data.TryGetValue(column, out var arr))
            {
                return arr as T[];
            }

            //TODO: these should not be needed
            // Find the datafield we want to use
            var dataField = Array.Find(_reader.Schema.GetDataFields(), field => field.Name == column.Name);

            if (dataField == null)
            {
                throw new ArgumentException($"Couldn't find column {column.Name} in table");
            }

            T[] data = null;
            try
            {
                // Read the data pages
                for (var page = 0; page < _reader.RowGroupCount; page++)
                {
                    // TODO: Do this asynchronously?
                    using (var pageReader = _reader.OpenRowGroupReader(page))
                    {
                        var dataColumn = pageReader.ReadColumn(dataField);
                        var prevLength = data?.Length ?? 0;
                        Array.Resize(ref data, prevLength + dataColumn.Data.Length);
                        Array.Copy(dataColumn.Data, 0, data, prevLength, dataColumn.Data.Length);
                    }
                }
            }
            catch (ArrayTypeMismatchException ex)
            {
                throw new ArrayTypeMismatchException($"Could not load column {column.Name}. The expected data is {typeof(T)} but actual data was {dataField.DataType}.\n\n{ex.Message}");
            }

            _data[column] = data;

            return data;
        }
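
A usage sketch for LoadColumn, assuming an instance of the containing class (here called table, which is illustrative); the type parameter must match the stored data type, otherwise the ArrayTypeMismatchException above is rethrown with a friendlier message:

        int[] ids = table.LoadColumn<int>(new DataField<int>("id"));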
Example #22
        /// <summary>
        /// Reads data from parquet stream
        /// </summary>
        /// <typeparam name="TModel">Type of model</typeparam>
        /// <param name="mapConfig">Mapping configuration</param>
        /// <param name="fileStream">Parquet stream</param>
        /// <returns>parsed data</returns>
        public TModel[] Read<TModel>(MapperConfig<TModel> mapConfig, Stream fileStream)
            where TModel : new()
        {
            using var parquetReader = new ParquetReader(fileStream);
            var dataFields = parquetReader.Schema.GetDataFields();

            long modelOffset = 0;
            var  resArr      = CreateArray<TModel>(parquetReader.ThriftMetadata.Num_rows);

            for (int i = 0; i < parquetReader.RowGroupCount; i++)
            {
                using var groupReader = parquetReader.OpenRowGroupReader(i);
                var columns = dataFields.Where(w => mapConfig.Contains(w.Name)).Select(groupReader.ReadColumn).ToArray();

                ReadColumns(mapConfig, resArr, columns, modelOffset);

                // increment offset to read next rowGroup
                modelOffset += groupReader.RowCount;
            }

            return resArr;
        }
Example #23
        public void BackwardCompat_list_with_one_array()
        {
            using (Stream input = OpenTestFile("legacy-list-onearray.parquet"))
            {
                using (var reader = new ParquetReader(input))
                {
                    Schema schema = reader.Schema;

                    //validate schema
                    Assert.Equal("impurityStats", schema[3].Name);
                    Assert.Equal(SchemaType.List, schema[3].SchemaType);
                    Assert.Equal("gain", schema[4].Name);
                    Assert.Equal(SchemaType.Data, schema[4].SchemaType);

                    //smoke test we can read it
                    using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(0))
                    {
                        DataColumn values4 = rg.ReadColumn((DataField)schema[4]);
                    }
                }
            }
        }
Example #24
        public void Read_multi_page_dictionary_with_nulls()
        {
            using (var reader = new ParquetReader(OpenTestFile("/special/multi_page_dictionary_with_nulls.parquet")))
            {
                DataColumn[] columns = reader.ReadEntireRowGroup();

                // reading columns
                var data = (string[])columns[0].Data;

                // ground truth from spark
                // check page boundary (first page contains 107432 rows)
                Assert.Equal("xc3w4eudww", data[107432]);
                Assert.Equal("bpywp4wtwk", data[107433]);
                Assert.Equal("z6x8652rle", data[107434]);

                // check near the end of the file
                Assert.Null(data[310028]);
                Assert.Equal("wok86kie6c", data[310029]);
                Assert.Equal("le9i7kbbib", data[310030]);
            }
        }
Example #25
        public static IEnumerable<T> ReadParquet<T>(this Stream stream) where T : class, new()
        {
            Type classType = typeof(T);

            var properties = classType.GetProperties().ToDictionary(p => p.Name, p => p);

            using (ParquetReader reader = new ParquetReader(stream))
            {
                DataField[] fields = reader.Schema.GetDataFields();
                for (int g = 0; g < reader.RowGroupCount; g++)
                {
                    using (ParquetRowGroupReader rowGroupReader = reader.OpenRowGroupReader(g))
                    {
                        DataColumn[] columns = fields.Select(rowGroupReader.ReadColumn).ToArray();
                        if (columns.Length > 0)
                        {
                            for (int i = 0; i < columns[0].Data.Length; i++)
                            {
                                T item = new T();
                                foreach (var column in columns)
                                {
                                    var prop = properties[column.Field.Name];
                                    if (column.Field.DataType == DataType.DateTimeOffset && prop.PropertyType == DateTimeType)
                                    {
                                        prop.SetValue(item, ((DateTimeOffset)column.Data.GetValue(i)).DateTime);
                                    }
                                    else
                                    {
                                        prop.SetValue(item, column.Data.GetValue(i));
                                    }
                                }
                                yield return item;
                            }
                        }
                    }
                }
            }
        }
Example #26
        private IEnumerable<List<DataColumn[]>> ReadObjectsByRowGroup(ParquetReader sr, Func<object, bool?> filterFunc = null)
        {
            DataField[] dataFields = sr.Schema.GetDataFields();

            for (int i = 0; i < sr.RowGroupCount; i++)
            {
                if (RaiseBeforeRowGroupLoad(i, null))
                {
                    continue;
                }

                List<DataColumn[]> rowGroup = new List<DataColumn[]>();
                using (ParquetRowGroupReader groupReader = sr.OpenRowGroupReader(i))
                {
                    var dc = dataFields.Select(groupReader.ReadColumn).ToArray();
                    rowGroup.Add(dc);
                }
                if (!RaiseAfterRowGroupLoaded(i, rowGroup))
                {
                    yield return rowGroup;
                }
            }
        }
Example #27
        public async Task SimpleTransformation()
        {
            var settings = GetAzureSettings();

            try
            {
                using (var store = GetDocumentStore())
                {
                    var baseline = new DateTime(2020, 1, 1);

                    using (var session = store.OpenAsyncSession())
                    {
                        for (int i = 1; i <= 10; i++)
                        {
                            var o = new Order
                            {
                                Id        = $"orders/{i}",
                                OrderedAt = baseline.AddDays(i),
                                Company   = $"companies/{i}",
                                ShipVia   = $"shippers/{i}"
                            };

                            await session.StoreAsync(o);
                        }

                        await session.SaveChangesAsync();
                    }

                    var etlDone = WaitForEtl(store, (n, statistics) => statistics.LoadSuccesses != 0);

                    var script = @"
var orderDate = new Date(this.OrderedAt);
var year = orderDate.getFullYear();
var month = orderDate.getMonth();
var key = new Date(year, month);

loadToOrders(partitionBy(key),
    {
        Company : this.Company,
        ShipVia : this.ShipVia
    })
";
                    SetupAzureEtl(store, script, settings);

                    etlDone.Wait(TimeSpan.FromMinutes(1));

                    using (var client = RavenAzureClient.Create(settings, DefaultBackupConfiguration))
                    {
                        var prefix = $"{settings.RemoteFolderName}/{CollectionName}";
                        var result = await client.ListBlobsAsync(prefix, delimiter : string.Empty, listFolders : false);

                        var list = result.List.ToList();
                        Assert.Equal(1, list.Count);

                        var blob = await client.GetBlobAsync(list[0].Name);

                        await using var ms = new MemoryStream();
                        blob.Data.CopyTo(ms);

                        using (var parquetReader = new ParquetReader(ms))
                        {
                            Assert.Equal(1, parquetReader.RowGroupCount);

                            var expectedFields = new[] { "Company", "ShipVia", ParquetTransformedItems.DefaultIdColumn, ParquetTransformedItems.LastModifiedColumn };

                            Assert.Equal(expectedFields.Length, parquetReader.Schema.Fields.Count);

                            using var rowGroupReader = parquetReader.OpenRowGroupReader(0);
                            foreach (var field in parquetReader.Schema.Fields)
                            {
                                Assert.True(field.Name.In(expectedFields));

                                var data = rowGroupReader.ReadColumn((DataField)field).Data;
                                Assert.True(data.Length == 10);

                                if (field.Name == ParquetTransformedItems.LastModifiedColumn)
                                {
                                    continue;
                                }

                                var count = 1;
                                foreach (var val in data)
                                {
                                    switch (field.Name)
                                    {
                                    case ParquetTransformedItems.DefaultIdColumn:
                                        Assert.Equal($"orders/{count}", val);
                                        break;

                                    case "Company":
                                        Assert.Equal($"companies/{count}", val);
                                        break;

                                    case "ShipVia":
                                        Assert.Equal($"shippers/{count}", val);
                                        break;
                                    }

                                    count++;
                                }
                            }
                        }
                    }
                }
            }

            finally
            {
                await DeleteObjects(settings);
            }
        }
Example #28
        public async Task CanLoadToMultipleTables()
        {
            const string salesTableName = "Sales";
            var          settings       = GetAzureSettings();

            try
            {
                using (var store = GetDocumentStore())
                {
                    var baseline = new DateTime(2020, 1, 1);

                    using (var session = store.OpenAsyncSession())
                    {
                        for (int i = 0; i < 31; i++)
                        {
                            var orderedAt = baseline.AddDays(i);
                            var lines     = new List<OrderLine>();

                            for (int j = 1; j <= 5; j++)
                            {
                                lines.Add(new OrderLine
                                {
                                    Quantity     = j * 10,
                                    PricePerUnit = i + j,
                                    Product      = $"Products/{j}"
                                });
                            }

                            var o = new Order
                            {
                                Id        = $"orders/{i}",
                                OrderedAt = orderedAt,
                                RequireAt = orderedAt.AddDays(7),
                                Company   = $"companies/{i}",
                                Lines     = lines
                            };

                            await session.StoreAsync(o);
                        }

                        baseline = baseline.AddMonths(1);

                        for (int i = 0; i < 28; i++)
                        {
                            var orderedAt = baseline.AddDays(i);
                            var lines     = new List<OrderLine>();

                            for (int j = 1; j <= 5; j++)
                            {
                                lines.Add(new OrderLine
                                {
                                    Quantity     = j * 10,
                                    PricePerUnit = i + j,
                                    Product      = $"Products/{j}"
                                });
                            }

                            var o = new Order
                            {
                                Id        = $"orders/{i + 31}",
                                OrderedAt = orderedAt,
                                RequireAt = orderedAt.AddDays(7),
                                Company   = $"companies/{i}",
                                Lines     = lines
                            };

                            await session.StoreAsync(o);
                        }

                        await session.SaveChangesAsync();
                    }

                    var etlDone = WaitForEtl(store, (n, statistics) => statistics.LoadSuccesses != 0);

                    var script = @"
var orderData = {
    Company : this.Company,
    RequireAt : new Date(this.RequireAt),
    ItemsCount: this.Lines.length,
    TotalCost: 0
};

var orderDate = new Date(this.OrderedAt);
var year = orderDate.getFullYear();
var month = orderDate.getMonth();
var key = new Date(year, month);

for (var i = 0; i < this.Lines.length; i++) {
    var line = this.Lines[i];
    orderData.TotalCost += (line.PricePerUnit * line.Quantity);
    
    // load to 'sales' table

    loadToSales(partitionBy(key), {
        Qty: line.Quantity,
        Product: line.Product,
        Cost: line.PricePerUnit
    });
}

// load to 'orders' table
loadToOrders(partitionBy(key), orderData);
";


                    SetupAzureEtl(store, script, settings);
                    etlDone.Wait(TimeSpan.FromMinutes(1));

                    using (var client = RavenAzureClient.Create(settings, DefaultBackupConfiguration))
                    {
                        var prefix = $"{settings.RemoteFolderName}/{CollectionName}";
                        var result = await client.ListBlobsAsync(prefix, delimiter : string.Empty, listFolders : false);

                        var list = result.List.ToList();

                        Assert.Equal(2, list.Count);
                        Assert.Contains("2020-01-01", list[0].Name);
                        Assert.Contains("2020-02-01", list[1].Name);

                        var blob = await client.GetBlobAsync(list[0].Name);

                        await using var ms = new MemoryStream();
                        blob.Data.CopyTo(ms);

                        using (var parquetReader = new ParquetReader(ms))
                        {
                            Assert.Equal(1, parquetReader.RowGroupCount);

                            var expectedFields = new[] { "Company", "RequireAt", "ItemsCount", "TotalCost", ParquetTransformedItems.DefaultIdColumn, ParquetTransformedItems.LastModifiedColumn };
                            Assert.Equal(expectedFields.Length, parquetReader.Schema.Fields.Count);

                            using var rowGroupReader = parquetReader.OpenRowGroupReader(0);
                            foreach (var field in parquetReader.Schema.Fields)
                            {
                                Assert.True(field.Name.In(expectedFields));

                                var data = rowGroupReader.ReadColumn((DataField)field).Data;
                                Assert.True(data.Length == 31);
                            }
                        }
                    }

                    //sales
                    using (var client = RavenAzureClient.Create(settings, DefaultBackupConfiguration))
                    {
                        var prefix = $"{settings.RemoteFolderName}/{salesTableName}";
                        var result = await client.ListBlobsAsync(prefix, delimiter : string.Empty, listFolders : false);

                        var list = result.List.ToList();

                        Assert.Equal(2, list.Count);
                        Assert.Contains("2020-01-01", list[0].Name);
                        Assert.Contains("2020-02-01", list[1].Name);

                        var blob = await client.GetBlobAsync(list[1].Name);

                        await using var ms = new MemoryStream();
                        blob.Data.CopyTo(ms);

                        using (var parquetReader = new ParquetReader(ms))
                        {
                            Assert.Equal(1, parquetReader.RowGroupCount);

                            var expectedFields = new[] { "Qty", "Product", "Cost", ParquetTransformedItems.DefaultIdColumn, ParquetTransformedItems.LastModifiedColumn };
                            Assert.Equal(expectedFields.Length, parquetReader.Schema.Fields.Count);

                            using var rowGroupReader = parquetReader.OpenRowGroupReader(0);
                            foreach (var field in parquetReader.Schema.Fields)
                            {
                                Assert.True(field.Name.In(expectedFields));

                                var data = rowGroupReader.ReadColumn((DataField)field).Data;
                                Assert.True(data.Length == 28 * 5);
                            }
                        }
                    }
                }
            }
            finally
            {
                await DeleteObjects(settings, salesTableName);
            }
        }
Example #29
        public async Task SimpleTransformation_NoPartition()
        {
            var settings = GetAzureSettings();

            try
            {
                using (var store = GetDocumentStore())
                {
                    var baseline = new DateTime(2020, 1, 1).ToUniversalTime();

                    using (var session = store.OpenAsyncSession())
                    {
                        for (int i = 0; i < 100; i++)
                        {
                            await session.StoreAsync(new Order
                            {
                                Id        = $"orders/{i}",
                                OrderedAt = baseline.AddDays(i),
                                ShipVia   = $"shippers/{i}",
                                Company   = $"companies/{i}"
                            });
                        }

                        await session.SaveChangesAsync();
                    }

                    var etlDone = WaitForEtl(store, (n, statistics) => statistics.LoadSuccesses != 0);

                    var script = @"
loadToOrders(noPartition(),
    {
        OrderDate : this.OrderedAt,
        Company : this.Company,
        ShipVia : this.ShipVia
    });
";
                    SetupAzureEtl(store, script, settings);

                    etlDone.Wait(TimeSpan.FromMinutes(1));

                    using (var client = RavenAzureClient.Create(settings, DefaultBackupConfiguration))
                    {
                        var prefix = $"{settings.RemoteFolderName}/{CollectionName}";

                        var cloudObjects = await client.ListBlobsAsync(prefix, delimiter : string.Empty, listFolders : false);

                        var list = cloudObjects.List.ToList();

                        Assert.Equal(1, list.Count);

                        var blob = await client.GetBlobAsync(list[0].Name);

                        await using var ms = new MemoryStream();
                        blob.Data.CopyTo(ms);

                        using (var parquetReader = new ParquetReader(ms))
                        {
                            Assert.Equal(1, parquetReader.RowGroupCount);

                            var expectedFields = new[] { "OrderDate", "ShipVia", "Company", ParquetTransformedItems.DefaultIdColumn, ParquetTransformedItems.LastModifiedColumn };

                            Assert.Equal(expectedFields.Length, parquetReader.Schema.Fields.Count);

                            using var rowGroupReader = parquetReader.OpenRowGroupReader(0);
                            foreach (var field in parquetReader.Schema.Fields)
                            {
                                Assert.True(field.Name.In(expectedFields));

                                var data = rowGroupReader.ReadColumn((DataField)field).Data;
                                Assert.True(data.Length == 100);

                                if (field.Name == ParquetTransformedItems.LastModifiedColumn)
                                {
                                    continue;
                                }

                                var count = 0;
                                foreach (var val in data)
                                {
                                    if (field.Name == "OrderDate")
                                    {
                                        var expectedDto = new DateTimeOffset(DateTime.SpecifyKind(baseline.AddDays(count), DateTimeKind.Utc));
                                        Assert.Equal(expectedDto, val);
                                    }

                                    else
                                    {
                                        var expected = field.Name switch
                                        {
                                            ParquetTransformedItems.DefaultIdColumn => $"orders/{count}",
                                            "Company" => $"companies/{count}",
                                            "ShipVia" => $"shippers/{count}",
                                            _ => null
                                        };

                                        Assert.Equal(expected, val);
                                    }

                                    count++;
                                }
                            }
                        }
                    }
                }
            }
            finally
            {
                await DeleteObjects(settings);
            }
        }
Example #30
        public async Task SimpleTransformation_MultiplePartitions()
        {
            var settings = GetAzureSettings();
            var prefix   = $"{settings.RemoteFolderName}/{CollectionName}/";

            try
            {
                using (var store = GetDocumentStore())
                {
                    var baseline = DateTime.SpecifyKind(new DateTime(2020, 1, 1), DateTimeKind.Utc);

                    using (var session = store.OpenAsyncSession())
                    {
                        const int total = 31 + 28; // days in January + days in February

                        for (int i = 0; i < total; i++)
                        {
                            var orderedAt = baseline.AddDays(i);
                            await session.StoreAsync(new Order
                            {
                                Id        = $"orders/{i}",
                                OrderedAt = orderedAt,
                                RequireAt = orderedAt.AddDays(7),
                                ShipVia   = $"shippers/{i}",
                                Company   = $"companies/{i}"
                            });
                        }

                        for (int i = 1; i <= 37; i++)
                        {
                            var index     = i + total;
                            var orderedAt = baseline.AddYears(1).AddMonths(1).AddDays(i);
                            await session.StoreAsync(new Order
                            {
                                Id        = $"orders/{index}",
                                OrderedAt = orderedAt,
                                RequireAt = orderedAt.AddDays(7),
                                ShipVia   = $"shippers/{index}",
                                Company   = $"companies/{index}"
                            });
                        }

                        await session.SaveChangesAsync();
                    }

                    var etlDone = WaitForEtl(store, (n, statistics) => statistics.LoadSuccesses != 0 && statistics.LoadErrors == 0);

                    var script = @"
var orderDate = new Date(this.OrderedAt);

loadToOrders(partitionBy(
    ['year', orderDate.getFullYear()],
    ['month', orderDate.getMonth() + 1]
),
    {
        Company : this.Company,
        ShipVia : this.ShipVia,
        RequireAt : this.RequireAt
    });
";
                    SetupAzureEtl(store, script, settings);

                    etlDone.Wait(TimeSpan.FromMinutes(1));

                    var expectedFields = new[] { "RequireAt", "ShipVia", "Company", ParquetTransformedItems.DefaultIdColumn, ParquetTransformedItems.LastModifiedColumn };

                    using (var client = RavenAzureClient.Create(settings, DefaultBackupConfiguration))
                    {
                        var cloudObjects = await client.ListBlobsAsync(prefix, delimiter : "/", listFolders : true);

                        var list = cloudObjects.List.ToList();

                        Assert.Equal(2, list.Count);
                        Assert.Contains("Orders/year=2020/", list[0].Name);
                        Assert.Contains("Orders/year=2021/", list[1].Name);

                        for (var index = 1; index <= list.Count; index++)
                        {
                            var folder          = list[index - 1];
                            var objectsInFolder = await client.ListBlobsAsync(prefix : folder.Name, delimiter : "/", listFolders : true);

                            var objects = objectsInFolder.List.ToList();
                            Assert.Equal(2, objects.Count);
                            Assert.Contains($"month={index}/", objects[0].Name);
                            Assert.Contains($"month={index + 1}/", objects[1].Name);
                        }

                        var files = await ListAllFilesInFolders(client, list);

                        Assert.Equal(4, files.Count);

                        foreach (var filePath in files)
                        {
                            var blob = await client.GetBlobAsync(filePath);

                            await using var ms = new MemoryStream();
                            blob.Data.CopyTo(ms);

                            using (var parquetReader = new ParquetReader(ms))
                            {
                                Assert.Equal(1, parquetReader.RowGroupCount);
                                Assert.Equal(expectedFields.Length, parquetReader.Schema.Fields.Count);

                                using var rowGroupReader = parquetReader.OpenRowGroupReader(0);
                                foreach (var field in parquetReader.Schema.Fields)
                                {
                                    Assert.True(field.Name.In(expectedFields));
                                    var data = rowGroupReader.ReadColumn((DataField)field).Data;

                                    Assert.True(data.Length == 31 || data.Length == 28 || data.Length == 27 || data.Length == 10);
                                    if (field.Name != "RequireAt")
                                    {
                                        continue;
                                    }

                                    var count = data.Length switch
                                    {
                                        31 => 0,
                                        28 => 31,
                                        27 => 365 + 33,
                                        10 => 365 + 33 + 27,
                                        _ => throw new ArgumentOutOfRangeException()
                                    };

                                    foreach (var val in data)
                                    {
                                        var expectedOrderDate = new DateTimeOffset(DateTime.SpecifyKind(baseline.AddDays(count++), DateTimeKind.Utc));
                                        var expected          = expectedOrderDate.AddDays(7);
                                        Assert.Equal(expected, val);
                                    }
                                }
                            }
                        }
                    }
                }
            }
            finally
            {
                await DeleteObjects(settings, prefix, delimiter : "/", listFolder : true);
            }
        }