public void FileMetadata_sets_num_rows_on_file_and_row_group_multiple_row_groups()
{
    var ms = new MemoryStream();
    var id = new DataField<int>("id");

    //write
    using (var writer = new ParquetWriter(new Schema(id), ms))
    {
        using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
        {
            rg.WriteColumn(new DataColumn(id, new[] { 1, 2, 3, 4 }));
        }

        using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
        {
            rg.WriteColumn(new DataColumn(id, new[] { 5, 6 }));
        }
    }

    //read back
    using (var reader = new ParquetReader(ms))
    {
        Assert.Equal(6, reader.ThriftMetadata.Num_rows);

        using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(0))
        {
            Assert.Equal(4, rg.RowCount);
        }

        using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(1))
        {
            Assert.Equal(2, rg.RowCount);
        }
    }
}
public void Write_in_small_row_groups()
{
    //write a single file having 3 row groups
    var id = new DataField<int>("id");
    var ms = new MemoryStream();

    using (var writer = new ParquetWriter(new Schema(id), ms))
    {
        using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
        {
            rg.WriteColumn(new DataColumn(id, new int[] { 1 }));
        }

        using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
        {
            rg.WriteColumn(new DataColumn(id, new int[] { 2 }));
        }

        using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
        {
            rg.WriteColumn(new DataColumn(id, new int[] { 3 }));
        }
    }

    //read the file back and validate
    ms.Position = 0;
    using (var reader = new ParquetReader(ms))
    {
        Assert.Equal(3, reader.RowGroupCount);

        using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(0))
        {
            Assert.Equal(1, rg.RowCount);
            DataColumn dc = rg.ReadColumn(id);
            Assert.Equal(new int[] { 1 }, dc.Data);
        }

        using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(1))
        {
            Assert.Equal(1, rg.RowCount);
            DataColumn dc = rg.ReadColumn(id);
            Assert.Equal(new int[] { 2 }, dc.Data);
        }

        using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(2))
        {
            Assert.Equal(1, rg.RowCount);
            DataColumn dc = rg.ReadColumn(id);
            Assert.Equal(new int[] { 3 }, dc.Data);
        }
    }
}
public static Dictionary<int, string> ReadParquetFile(string infile)
{
    Dictionary<int, string> serializedRequests = new Dictionary<int, string>();
    string path = Path.GetFullPath(Directory.GetCurrentDirectory() + "/" + infile);

    using (Stream fileStream = File.OpenRead(path))
    {
        using (var parquetReader = new ParquetReader(fileStream))
        {
            DataField[] dataFields = parquetReader.Schema.GetDataFields();

            using (ParquetRowGroupReader groupReader = parquetReader.OpenRowGroupReader(0))
            {
                DataColumn[] columns = dataFields.Select(groupReader.ReadColumn).ToArray();
                DataColumn firstColumn = columns[0];
                DataColumn secondColumn = columns[1];

                Array idData = firstColumn.Data;
                Array requestData = secondColumn.Data;

                for (var j = 0; j < firstColumn.Data.Length; j++)
                {
                    var convertedRequestData = (string)requestData.GetValue(j);
                    var convertedIdData = (int)idData.GetValue(j);
                    serializedRequests.Add(convertedIdData, convertedRequestData);
                }
            }

            return serializedRequests;
        }
    }
}
public void ReadIntro()
{
    // open file stream
    using (Stream fileStream = System.IO.File.OpenRead("c:\\test.parquet"))
    {
        // open parquet file reader
        using (var parquetReader = new ParquetReader(fileStream))
        {
            // get file schema (available straight after opening parquet reader)
            // however, get only data fields as only they contain data values
            DataField[] dataFields = parquetReader.Schema.GetDataFields();

            // enumerate through row groups in this file
            for (int i = 0; i < parquetReader.RowGroupCount; i++)
            {
                // create row group reader
                using (ParquetRowGroupReader groupReader = parquetReader.OpenRowGroupReader(i))
                {
                    // read all columns inside each row group (you have an option to read only
                    // required columns if you need to)
                    DataColumn[] columns = dataFields.Select(groupReader.ReadColumn).ToArray();

                    // get first column, for instance
                    DataColumn firstColumn = columns[0];

                    // .Data member contains a typed array of column data you can cast to the type of the column
                    Array data = firstColumn.Data;
                    int[] ids = (int[])data;
                }
            }
        }
    }
}
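For context, here is a minimal write-side counterpart to ReadIntro above. It is a sketch assembled only from Parquet.Net APIs the other examples on this page already use (Schema, DataField<T>, ParquetWriter, CreateRowGroup, WriteColumn); the file path, class name, and field name are illustrative, not part of the library.

using System.IO;
using Parquet;
using Parquet.Data;

public static class WriteIntroExample
{
    public static void WriteIntro()
    {
        // declare the schema: a single int32 column called "id" (illustrative name)
        var id = new DataField<int>("id");

        using (Stream fileStream = File.Create("c:\\test.parquet"))
        {
            using (var parquetWriter = new ParquetWriter(new Schema(id), fileStream))
            {
                // each CreateRowGroup() call starts a new row group;
                // write one DataColumn per data field in schema order
                using (ParquetRowGroupWriter rowGroup = parquetWriter.CreateRowGroup())
                {
                    rowGroup.WriteColumn(new DataColumn(id, new[] { 1, 2, 3 }));
                }
            }
        }
    }
}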
public IEnumerable<DataColumn[]> GetData(string file)
{
    using (Stream fileStream = System.IO.File.OpenRead(file))
    {
        // open parquet file reader
        using (var parquetReader = new ParquetReader(fileStream))
        {
            // get file schema (available straight after opening parquet reader)
            // however, get only data fields as only they contain data values
            DataField[] dataFields = parquetReader.Schema.GetDataFields();

            // enumerate through row groups in this file
            for (int i = 0; i < parquetReader.RowGroupCount; i++)
            {
                // create row group reader
                using (ParquetRowGroupReader groupReader = parquetReader.OpenRowGroupReader(i))
                {
                    // read all columns inside each row group (you have an option to read only
                    // required columns if you need to)
                    yield return dataFields.Select(groupReader.ReadColumn).ToArray();
                }
            }
        }
    }
}
public void Write_read_nullable_column(Array input)
{
    var id = new DataField<int?>("id");
    var ms = new MemoryStream();

    using (var writer = new ParquetWriter(new Schema(id), ms))
    {
        using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
        {
            rg.WriteColumn(new DataColumn(id, input));
        }
    }

    ms.Position = 0;
    using (var reader = new ParquetReader(ms))
    {
        Assert.Equal(1, reader.RowGroupCount);

        using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(0))
        {
            Assert.Equal(input.Length, rg.RowCount);
            Assert.Equal(input, rg.ReadColumn(id).Data);
        }
    }
}
public static DataTable ParquetReaderToDataTable(ParquetReader parquetReader, out int totalRecordCount, List<string> selectedFields, int offset, int recordCount)
{
    //Get list of data fields and construct the DataTable
    DataTable dataTable = new DataTable();
    List<Parquet.Data.DataField> fields = new List<Parquet.Data.DataField>();
    var dataFields = parquetReader.Schema.GetDataFields();

    foreach (string selectedField in selectedFields)
    {
        var dataField = dataFields.FirstOrDefault(f => f.Name.Equals(selectedField, StringComparison.InvariantCultureIgnoreCase));
        if (dataField != null)
        {
            fields.Add(dataField);
            DataColumn newColumn = new DataColumn(dataField.Name, ParquetNetTypeToCSharpType(dataField.DataType))
            {
                // Should not set this, or line 89 in ProcessRowGroup() will throw an exception
                // with any required field (because assigning later than adding)
                //AllowDBNull = dataField.HasNulls
            };
            dataTable.Columns.Add(newColumn);
        }
        else
        {
            throw new Exception(string.Format("Field '{0}' does not exist", selectedField));
        }
    }

    //Read column by column to generate each row in the datatable
    totalRecordCount = 0;
    for (int i = 0; i < parquetReader.RowGroupCount; i++)
    {
        int rowsLeftToRead = recordCount;

        using (ParquetRowGroupReader groupReader = parquetReader.OpenRowGroupReader(i))
        {
            if (groupReader.RowCount > int.MaxValue)
            {
                throw new ArgumentOutOfRangeException(string.Format("Cannot handle row group sizes greater than {0}", int.MaxValue));
            }

            int rowsPassedUntilThisRowGroup = totalRecordCount;
            totalRecordCount += (int)groupReader.RowCount;

            if (offset >= totalRecordCount)
            {
                continue;
            }

            if (rowsLeftToRead > 0)
            {
                int numberOfRecordsToReadFromThisRowGroup = Math.Min(Math.Min(totalRecordCount - offset, recordCount), (int)groupReader.RowCount);
                rowsLeftToRead -= numberOfRecordsToReadFromThisRowGroup;

                int recordsToSkipInThisRowGroup = Math.Max(offset - rowsPassedUntilThisRowGroup, 0);

                ProcessRowGroup(dataTable, groupReader, fields, recordsToSkipInThisRowGroup, numberOfRecordsToReadFromThisRowGroup);
            }
        }
    }

    return dataTable;
}
public void List_of_elements_with_some_items_empty_reads_file()
{
    /*
     * list data:
     * - 1: [1, 2, 3]
     * - 2: []
     * - 3: [1, 2, 3]
     * - 4: []
     */

    using (var reader = new ParquetReader(OpenTestFile("listofitems-empty-alternates.parquet")))
    {
        using (ParquetRowGroupReader groupReader = reader.OpenRowGroupReader(0))
        {
            Assert.Equal(4, groupReader.RowCount);

            DataField[] fs = reader.Schema.GetDataFields();

            DataColumn id = groupReader.ReadColumn(fs[0]);
            Assert.Equal(4, id.Data.Length);
            Assert.False(id.HasRepetitions);

            DataColumn list = groupReader.ReadColumn(fs[1]);
            Assert.Equal(8, list.Data.Length);
            Assert.Equal(new int[] { 0, 1, 1, 0, 0, 1, 1, 0 }, list.RepetitionLevels);
        }
    }
}
private bool LoadRowGroup(int rowGroup)
{
    using (var reader = _parquetReader.OpenRowGroupReader(rowGroup))
    {
        _rowIndex = 0;
        _rowCount = reader.RowCount;

        var columns = new List<Parquet.Data.DataColumn>();
        foreach (var f in _fields)
        {
            try
            {
                columns.Add(reader.ReadColumn(f));
            }
            catch (IndexOutOfRangeException)
            {
                // this happens if every single element in the column within the rowgroup is null,
                // TODO: Figure out how to read the header to detect this without an exception
                columns.Add(null);
            }
        }

        _columns = columns;
    }

    return true;
}
public static void TestAgainstThirdParty()
{
    var columns = new Column[] { new Column<decimal>("Decimal", LogicalType.Decimal(precision: 29, scale: 3)) };
    var values = Enumerable.Range(0, 10_000)
        .Select(i => ((decimal)i * i * i) / 1000 - 10)
        .Concat(new[] { decimal.MinValue / 1000, decimal.MaxValue / 1000 })
        .ToArray();

    using var buffer = new ResizableBuffer();

    // Write using ParquetSharp
    using (var outStream = new BufferOutputStream(buffer))
    {
        using var fileWriter = new ParquetFileWriter(outStream, columns, Compression.Snappy);
        using var rowGroupWriter = fileWriter.AppendRowGroup();
        using var columnWriter = rowGroupWriter.NextColumn().LogicalWriter<decimal>();

        columnWriter.WriteBatch(values);

        fileWriter.Close();
    }

    // Read using Parquet.NET
    using var memoryStream = new MemoryStream(buffer.ToArray());
    using var fileReader = new ParquetReader(memoryStream);
    using var rowGroupReader = fileReader.OpenRowGroupReader(0);

    var read = (decimal[])rowGroupReader.ReadColumn(fileReader.Schema.GetDataFields()[0]).Data;
    Assert.AreEqual(values, read);
}
public void Write_multiple_row_groups_to_forward_only_stream()
{
    var ms = new MemoryStream();
    var forwardOnly = new WriteableNonSeekableStream(ms);

    var schema = new Schema(
        new DataField<int>("id"),
        new DataField<string>("nonsense"));

    using (var writer = new ParquetWriter(schema, forwardOnly))
    {
        using (ParquetRowGroupWriter rgw = writer.CreateRowGroup(1))
        {
            rgw.WriteColumn(new DataColumn((DataField)schema[0], new[] { 1 }));
            rgw.WriteColumn(new DataColumn((DataField)schema[1], new[] { "1" }));
        }

        using (ParquetRowGroupWriter rgw = writer.CreateRowGroup(1))
        {
            rgw.WriteColumn(new DataColumn((DataField)schema[0], new[] { 2 }));
            rgw.WriteColumn(new DataColumn((DataField)schema[1], new[] { "2" }));
        }
    }

    ms.Position = 0;
    using (var reader = new ParquetReader(ms))
    {
        Assert.Equal(2, reader.RowGroupCount);

        using (ParquetRowGroupReader rgr = reader.OpenRowGroupReader(0))
        {
            Assert.Equal(1, rgr.RowCount);
            DataColumn column = rgr.ReadColumn((DataField)schema[0]);
            Assert.Equal(1, column.Data.GetValue(0));
        }

        using (ParquetRowGroupReader rgr = reader.OpenRowGroupReader(1))
        {
            Assert.Equal(1, rgr.RowCount);
            DataColumn column = rgr.ReadColumn((DataField)schema[0]);
            Assert.Equal(2, column.Data.GetValue(0));
        }
    }
}
public void Append_to_file_reads_all_data()
{
    //write a file with a single row group
    var id = new DataField<int>("id");
    var ms = new MemoryStream();

    using (var writer = new ParquetWriter(new Schema(id), ms))
    {
        using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
        {
            rg.WriteColumn(new DataColumn(id, new int[] { 1, 2 }));
        }
    }

    //append to this file. Note that you cannot append to an existing row group, therefore create a new one
    ms.Position = 0;
    using (var writer = new ParquetWriter(new Schema(id), ms, append: true))
    {
        using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
        {
            rg.WriteColumn(new DataColumn(id, new int[] { 3, 4 }));
        }
    }

    //check that this file now contains two row groups and all the data is valid
    ms.Position = 0;
    using (var reader = new ParquetReader(ms))
    {
        Assert.Equal(2, reader.RowGroupCount);

        using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(0))
        {
            Assert.Equal(2, rg.RowCount);
            Assert.Equal(new int[] { 1, 2 }, rg.ReadColumn(id).Data);
        }

        using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(1))
        {
            Assert.Equal(2, rg.RowCount);
            Assert.Equal(new int[] { 3, 4 }, rg.ReadColumn(id).Data);
        }
    }
}
public static List<T> ReadParquet<T>(this Stream stream) where T : class, new()
{
    Type classType = typeof(T);
    List<T> results = new List<T>();
    var properties = classType.GetProperties().ToDictionary(p => p.Name, p => p);
    var bytes = stream.ReadAsBytes().GetAwaiter().GetResult();

    using (ParquetReader reader = new ParquetReader(new MemoryStream(bytes)))
    {
        DataField[] fields = reader.Schema.GetDataFields();

        for (int g = 0; g < reader.RowGroupCount; g++)
        {
            using (ParquetRowGroupReader rowGroupReader = reader.OpenRowGroupReader(g))
            {
                DataColumn[] columns = fields.Select(rowGroupReader.ReadColumn).ToArray();
                if (columns.Length > 0)
                {
                    for (int i = 0; i < columns[0].Data.Length; i++)
                    {
                        T item = new T();
                        foreach (var column in columns)
                        {
                            var prop = properties[column.Field.Name];
                            if (column.Field.DataType == ParquetDataType.DateTimeOffset)
                            {
                                if (prop.PropertyType == DateTimeType)
                                {
                                    prop.SetValue(item, ((DateTimeOffset)column.Data.GetValue(i)).DateTime);
                                }
                                else if (prop.PropertyType == NullableDateTimeType)
                                {
                                    var value = column.Data.GetValue(i);
                                    if (value != null)
                                    {
                                        prop.SetValue(item, ((DateTimeOffset)value).DateTime);
                                    }
                                }
                            }
                            else
                            {
                                prop.SetValue(item, column.Data.GetValue(i));
                            }
                        }

                        results.Add(item);
                    }
                }
            }
        }
    }

    return results;
}
private IEnumerable<DataColumn[]> ReadAllObjects(ParquetReader sr, Func<object, bool?> filterFunc = null)
{
    DataField[] dataFields = sr.Schema.GetDataFields();

    for (int i = 0; i < sr.RowGroupCount; i++)
    {
        using (ParquetRowGroupReader groupReader = sr.OpenRowGroupReader(i))
        {
            var dc = dataFields.Select(groupReader.ReadColumn).ToArray();
            yield return dc;
        }
    }
}
/// <summary>
/// Gets a DataTable from a parquet file
/// </summary>
public DataTable ParquetReaderToDataTable(string fileName, int offset, int recordCount, out int totalRecordCount)
{
    DataTable dataTable = new DataTable();

    // Initialise the total record count
    totalRecordCount = 0;

    // Read the file
    using (System.IO.Stream fileReader = System.IO.File.OpenRead(fileName))
    {
        using (ParquetReader parquetReader = new ParquetReader(fileReader))
        {
            DataField[] dataFields = parquetReader.Schema.GetDataFields();

            // Create the columns in the table
            CreateColumns(dataTable, dataFields);

            //Read column by column to generate each row in the datatable
            for (int rowGroup = 0; rowGroup < parquetReader.RowGroupCount; rowGroup++)
            {
                int rowsLeftToRead = recordCount;

                using (ParquetRowGroupReader groupReader = parquetReader.OpenRowGroupReader(rowGroup))
                {
                    if (groupReader.RowCount > int.MaxValue)
                    {
                        throw new ArgumentOutOfRangeException(string.Format("Cannot handle row group sizes greater than {0}", int.MaxValue));
                    }

                    int rowsPassedUntilThisRowGroup = totalRecordCount;
                    totalRecordCount += (int)groupReader.RowCount;

                    if (offset >= totalRecordCount)
                    {
                        continue;
                    }

                    if (rowsLeftToRead > 0)
                    {
                        int numberOfRecordsToReadFromThisRowGroup = Math.Min(Math.Min(totalRecordCount - offset, recordCount), (int)groupReader.RowCount);
                        rowsLeftToRead -= numberOfRecordsToReadFromThisRowGroup;

                        int recordsToSkipInThisRowGroup = Math.Max(offset - rowsPassedUntilThisRowGroup, 0);

                        ProcessRowGroup(dataTable, groupReader, dataFields, recordsToSkipInThisRowGroup, numberOfRecordsToReadFromThisRowGroup);
                    }
                }
            }
        }
    }

    // Return the data read
    return dataTable;
}
public static List<Dictionary<string, object>> ReadParquetAdDictData(this Stream stream, List<string> mappedFields = null)
{
    List<Dictionary<string, object>> results = new List<Dictionary<string, object>>();
    var bytes = stream.ReadAsBytes().GetAwaiter().GetResult();

    using (ParquetReader reader = new ParquetReader(new MemoryStream(bytes)))
    {
        DataField[] fields = reader.Schema.GetDataFields();

        for (int g = 0; g < reader.RowGroupCount; g++)
        {
            using (ParquetRowGroupReader rowGroupReader = reader.OpenRowGroupReader(g))
            {
                DataColumn[] columns = fields.Select(rowGroupReader.ReadColumn).ToArray();
                if (columns.Length > 0)
                {
                    Dictionary<string, DataColumn> columnDict = columns.ToDictionary(c => c.Field.Name, c => c);

                    if (mappedFields != null)
                    {
                        Dictionary<string, DataColumn> mappedDict = new Dictionary<string, DataColumn>();
                        for (int i = 0; i < mappedFields.Count; i++)
                        {
                            var mappedField = mappedFields[i];
                            if (!mappedDict.ContainsKey(mappedField) && columnDict.ContainsKey(mappedField))
                            {
                                mappedDict.Add(mappedField, columnDict[mappedField]);
                            }
                        }

                        columnDict = mappedDict;
                    }

                    for (int i = 0; i < columns[0].Data.Length; i++)
                    {
                        var item = new Dictionary<string, object>();
                        foreach (var column in columnDict.Values)
                        {
                            item.Add(column.Field.Name, column.Data.GetValue(i));
                        }

                        results.Add(item);
                    }
                }
            }
        }
    }

    return results;
}
private DataColumn[] ReadParquet(string name, bool treatByteArrayAsString)
{
    using (Stream s = OpenTestFile(name))
    {
        using (var pr = new ParquetReader(s, new ParquetOptions { TreatByteArrayAsString = treatByteArrayAsString }))
        {
            using (ParquetRowGroupReader rgr = pr.OpenRowGroupReader(0))
            {
                return pr.Schema.GetDataFields()
                    .Select(df => rgr.ReadColumn(df))
                    .ToArray();
            }
        }
    }
}
protected object WriteReadSingle(DataField field, object value, CompressionMethod compressionMethod = CompressionMethod.None, int compressionLevel = -1)
{
    //for sanity, use disconnected streams
    byte[] data;

    using (var ms = new MemoryStream())
    {
        // write single value
        using (var writer = new ParquetWriter(new Schema(field), ms))
        {
            writer.CompressionMethod = compressionMethod;
            writer.CompressionLevel = compressionLevel;

            using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
            {
                Array dataArray = Array.CreateInstance(field.ClrNullableIfHasNullsType, 1);
                dataArray.SetValue(value, 0);
                var column = new DataColumn(field, dataArray);

                rg.WriteColumn(column);
            }
        }

        data = ms.ToArray();
    }

    using (var ms = new MemoryStream(data))
    {
        // read back single value
        ms.Position = 0;
        using (var reader = new ParquetReader(ms))
        {
            using (ParquetRowGroupReader rowGroupReader = reader.OpenRowGroupReader(0))
            {
                DataColumn column = rowGroupReader.ReadColumn(field);

                return column.Data.GetValue(0);
            }
        }
    }
}
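A hedged usage sketch of the WriteReadSingle helper above, assuming it is called from within the same test class; the field name, value, and compression choice are illustrative (CompressionMethod.Gzip is one of Parquet.Net's built-in compression methods).

// round-trip a single int32 value through a gzip-compressed in-memory file
object roundTripped = WriteReadSingle(new DataField<int>("id"), 123, CompressionMethod.Gzip);
Assert.Equal(123, roundTripped);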
protected DataColumn[] WriteReadSingleRowGroup(Schema schema, DataColumn[] columns, out Schema readSchema)
{
    using (var ms = new MemoryStream())
    {
        ms.WriteSingleRowGroupParquetFile(schema, columns);
        ms.Position = 0;

        using (var reader = new ParquetReader(ms))
        {
            readSchema = reader.Schema;

            using (ParquetRowGroupReader rgReader = reader.OpenRowGroupReader(0))
            {
                return columns.Select(c => rgReader.ReadColumn(c.Field))
                    .ToArray();
            }
        }
    }
}
protected DataColumn WriteReadSingleColumn(DataField field, DataColumn dataColumn)
{
    using (var ms = new MemoryStream())
    {
        // write with built-in extension method
        ms.WriteSingleRowGroupParquetFile(new Schema(field), dataColumn);
        ms.Position = 0;

        // read first row group and first column
        using (var reader = new ParquetReader(ms))
        {
            if (reader.RowGroupCount == 0)
            {
                return null;
            }

            ParquetRowGroupReader rgReader = reader.OpenRowGroupReader(0);

            return rgReader.ReadColumn(field);
        }
    }
}
public T[] LoadColumn<T>(DataField column)
{
    if (_data.TryGetValue(column, out var arr))
    {
        return arr as T[];
    }

    //TODO: these should not be needed
    // Find the datafield we want to use
    var dataField = Array.Find(_reader.Schema.GetDataFields(), field => field.Name == column.Name);
    if (dataField == null)
    {
        throw new ArgumentException($"Couldn't find column {column.Name} in table");
    }

    T[] data = null;
    try
    {
        // Read the data pages
        for (var page = 0; page < _reader.RowGroupCount; page++)
        {
            // TODO: Do this asynchronously?
            var pageReader = _reader.OpenRowGroupReader(page);
            var dataColumn = pageReader.ReadColumn(dataField);

            var prevLength = data?.Length ?? 0;
            Array.Resize(ref data, prevLength + dataColumn.Data.Length);
            Array.Copy(dataColumn.Data, 0, data, prevLength, dataColumn.Data.Length);
        }
    }
    catch (ArrayTypeMismatchException ex)
    {
        throw new ArrayTypeMismatchException($"Could not load column {column.Name}. The expected data is {typeof(T)} but actual data was {dataField.DataType}.\n\n{ex.Message}");
    }

    _data[column] = data;
    return data;
}
/// <summary>
/// Reads data from parquet stream
/// </summary>
/// <typeparam name="TModel">Type of model</typeparam>
/// <param name="mapConfig">Mapping configuration</param>
/// <param name="fileStream">Parquet stream</param>
/// <returns>parsed data</returns>
public TModel[] Read<TModel>(MapperConfig<TModel> mapConfig, Stream fileStream) where TModel : new()
{
    using var parquetReader = new ParquetReader(fileStream);
    var dataFields = parquetReader.Schema.GetDataFields();

    long modelOffset = 0;
    var resArr = CreateArray<TModel>(parquetReader.ThriftMetadata.Num_rows);

    for (int i = 0; i < parquetReader.RowGroupCount; i++)
    {
        using var groupReader = parquetReader.OpenRowGroupReader(i);
        var columns = dataFields.Where(w => mapConfig.Contains(w.Name)).Select(groupReader.ReadColumn).ToArray();

        ReadColumns(mapConfig, resArr, columns, modelOffset);

        // increment offset to read next rowGroup
        modelOffset += groupReader.RowCount;
    }

    return resArr;
}
public void BackwardCompat_list_with_one_array()
{
    using (Stream input = OpenTestFile("legacy-list-onearray.parquet"))
    {
        using (var reader = new ParquetReader(input))
        {
            Schema schema = reader.Schema;

            //validate schema
            Assert.Equal("impurityStats", schema[3].Name);
            Assert.Equal(SchemaType.List, schema[3].SchemaType);
            Assert.Equal("gain", schema[4].Name);
            Assert.Equal(SchemaType.Data, schema[4].SchemaType);

            //smoke test we can read it
            using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(0))
            {
                DataColumn values4 = rg.ReadColumn((DataField)schema[4]);
            }
        }
    }
}
public void Read_multi_page_dictionary_with_nulls()
{
    using (var reader = new ParquetReader(OpenTestFile("/special/multi_page_dictionary_with_nulls.parquet")))
    {
        DataColumn[] columns = reader.ReadEntireRowGroup();
        var rg = reader.OpenRowGroupReader(0);

        // reading columns
        var data = (string[])columns[0].Data;

        // ground truth from spark
        // check page boundary (first page contains 107432 rows)
        Assert.Equal("xc3w4eudww", data[107432]);
        Assert.Equal("bpywp4wtwk", data[107433]);
        Assert.Equal("z6x8652rle", data[107434]);

        // check near the end of the file
        Assert.Null(data[310028]);
        Assert.Equal("wok86kie6c", data[310029]);
        Assert.Equal("le9i7kbbib", data[310030]);
    }
}
public static IEnumerable<T> ReadParquet<T>(this Stream stream) where T : class, new()
{
    Type classType = typeof(T);
    var properties = classType.GetProperties().ToDictionary(p => p.Name, p => p);

    using (ParquetReader reader = new ParquetReader(stream))
    {
        DataField[] fields = reader.Schema.GetDataFields();

        for (int g = 0; g < reader.RowGroupCount; g++)
        {
            using (ParquetRowGroupReader rowGroupReader = reader.OpenRowGroupReader(g))
            {
                DataColumn[] columns = fields.Select(rowGroupReader.ReadColumn).ToArray();
                if (columns.Length > 0)
                {
                    for (int i = 0; i < columns[0].Data.Length; i++)
                    {
                        T item = new T();
                        foreach (var column in columns)
                        {
                            var prop = properties[column.Field.Name];
                            if (column.Field.DataType == DataType.DateTimeOffset && prop.PropertyType == DateTimeType)
                            {
                                prop.SetValue(item, ((DateTimeOffset)column.Data.GetValue(i)).DateTime);
                            }
                            else
                            {
                                prop.SetValue(item, column.Data.GetValue(i));
                            }
                        }

                        yield return item;
                    }
                }
            }
        }
    }
}
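A hedged usage sketch of the streaming ReadParquet<T> extension above. MyRecord and the file path are hypothetical; property names must match the parquet field names exactly, because the extension resolves properties via column.Field.Name.

using System;
using System.IO;

// hypothetical POCO; its shape must mirror the parquet schema
public class MyRecord
{
    public int Id { get; set; }
    public string Name { get; set; }
    public DateTime Created { get; set; } // DateTimeOffset columns are mapped to DateTime by the extension
}

public static class ReadParquetUsage
{
    public static void Run()
    {
        using (FileStream fs = File.OpenRead("data.parquet")) // illustrative path
        {
            // rows are materialised lazily, one row group at a time
            foreach (MyRecord row in fs.ReadParquet<MyRecord>())
            {
                Console.WriteLine($"{row.Id}: {row.Name}");
            }
        }
    }
}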
private IEnumerable<List<DataColumn[]>> ReadObjectsByRowGroup(ParquetReader sr, Func<object, bool?> filterFunc = null)
{
    DataField[] dataFields = sr.Schema.GetDataFields();

    for (int i = 0; i < sr.RowGroupCount; i++)
    {
        if (RaiseBeforeRowGroupLoad(i, null))
        {
            continue;
        }

        List<DataColumn[]> rowGroup = new List<DataColumn[]>();

        using (ParquetRowGroupReader groupReader = sr.OpenRowGroupReader(i))
        {
            var dc = dataFields.Select(groupReader.ReadColumn).ToArray();
            rowGroup.Add(dc);
        }

        if (!RaiseAfterRowGroupLoaded(i, rowGroup))
        {
            yield return rowGroup;
        }
    }
}
public async Task SimpleTransformation()
{
    var settings = GetAzureSettings();

    try
    {
        using (var store = GetDocumentStore())
        {
            var baseline = new DateTime(2020, 1, 1);

            using (var session = store.OpenAsyncSession())
            {
                for (int i = 1; i <= 10; i++)
                {
                    var o = new Order
                    {
                        Id = $"orders/{i}",
                        OrderedAt = baseline.AddDays(i),
                        Company = $"companies/{i}",
                        ShipVia = $"shippers/{i}"
                    };

                    await session.StoreAsync(o);
                }

                await session.SaveChangesAsync();
            }

            var etlDone = WaitForEtl(store, (n, statistics) => statistics.LoadSuccesses != 0);

            var script = @"
var orderDate = new Date(this.OrderedAt);
var year = orderDate.getFullYear();
var month = orderDate.getMonth();
var key = new Date(year, month);

loadToOrders(partitionBy(key),
    {
        Company : this.Company,
        ShipVia : this.ShipVia
    })
";

            SetupAzureEtl(store, script, settings);
            etlDone.Wait(TimeSpan.FromMinutes(1));

            using (var client = RavenAzureClient.Create(settings, DefaultBackupConfiguration))
            {
                var prefix = $"{settings.RemoteFolderName}/{CollectionName}";
                var result = await client.ListBlobsAsync(prefix, delimiter: string.Empty, listFolders: false);
                var list = result.List.ToList();

                Assert.Equal(1, list.Count);

                var blob = await client.GetBlobAsync(list[0].Name);
                await using var ms = new MemoryStream();
                blob.Data.CopyTo(ms);

                using (var parquetReader = new ParquetReader(ms))
                {
                    Assert.Equal(1, parquetReader.RowGroupCount);

                    var expectedFields = new[] { "Company", "ShipVia", ParquetTransformedItems.DefaultIdColumn, ParquetTransformedItems.LastModifiedColumn };

                    Assert.Equal(expectedFields.Length, parquetReader.Schema.Fields.Count);

                    using var rowGroupReader = parquetReader.OpenRowGroupReader(0);
                    foreach (var field in parquetReader.Schema.Fields)
                    {
                        Assert.True(field.Name.In(expectedFields));

                        var data = rowGroupReader.ReadColumn((DataField)field).Data;
                        Assert.True(data.Length == 10);

                        if (field.Name == ParquetTransformedItems.LastModifiedColumn)
                            continue;

                        var count = 1;
                        foreach (var val in data)
                        {
                            switch (field.Name)
                            {
                                case ParquetTransformedItems.DefaultIdColumn:
                                    Assert.Equal($"orders/{count}", val);
                                    break;
                                case "Company":
                                    Assert.Equal($"companies/{count}", val);
                                    break;
                                case "ShipVia":
                                    Assert.Equal($"shippers/{count}", val);
                                    break;
                            }

                            count++;
                        }
                    }
                }
            }
        }
    }
    finally
    {
        await DeleteObjects(settings);
    }
}
public async Task CanLoadToMultipleTables()
{
    const string salesTableName = "Sales";
    var settings = GetAzureSettings();

    try
    {
        using (var store = GetDocumentStore())
        {
            var baseline = new DateTime(2020, 1, 1);

            using (var session = store.OpenAsyncSession())
            {
                for (int i = 0; i < 31; i++)
                {
                    var orderedAt = baseline.AddDays(i);
                    var lines = new List<OrderLine>();

                    for (int j = 1; j <= 5; j++)
                    {
                        lines.Add(new OrderLine
                        {
                            Quantity = j * 10,
                            PricePerUnit = i + j,
                            Product = $"Products/{j}"
                        });
                    }

                    var o = new Order
                    {
                        Id = $"orders/{i}",
                        OrderedAt = orderedAt,
                        RequireAt = orderedAt.AddDays(7),
                        Company = $"companies/{i}",
                        Lines = lines
                    };

                    await session.StoreAsync(o);
                }

                baseline = baseline.AddMonths(1);

                for (int i = 0; i < 28; i++)
                {
                    var orderedAt = baseline.AddDays(i);
                    var lines = new List<OrderLine>();

                    for (int j = 1; j <= 5; j++)
                    {
                        lines.Add(new OrderLine
                        {
                            Quantity = j * 10,
                            PricePerUnit = i + j,
                            Product = $"Products/{j}"
                        });
                    }

                    var o = new Order
                    {
                        Id = $"orders/{i + 31}",
                        OrderedAt = orderedAt,
                        RequireAt = orderedAt.AddDays(7),
                        Company = $"companies/{i}",
                        Lines = lines
                    };

                    await session.StoreAsync(o);
                }

                await session.SaveChangesAsync();
            }

            var etlDone = WaitForEtl(store, (n, statistics) => statistics.LoadSuccesses != 0);

            var script = @"
var orderData = {
    Company : this.Company,
    RequireAt : new Date(this.RequireAt),
    ItemsCount: this.Lines.length,
    TotalCost: 0
};

var orderDate = new Date(this.OrderedAt);
var year = orderDate.getFullYear();
var month = orderDate.getMonth();
var key = new Date(year, month);

for (var i = 0; i < this.Lines.length; i++) {
    var line = this.Lines[i];
    orderData.TotalCost += (line.PricePerUnit * line.Quantity);

    // load to 'sales' table
    loadToSales(partitionBy(key), {
        Qty: line.Quantity,
        Product: line.Product,
        Cost: line.PricePerUnit
    });
}

// load to 'orders' table
loadToOrders(partitionBy(key), orderData);
";

            SetupAzureEtl(store, script, settings);
            etlDone.Wait(TimeSpan.FromMinutes(1));

            using (var client = RavenAzureClient.Create(settings, DefaultBackupConfiguration))
            {
                var prefix = $"{settings.RemoteFolderName}/{CollectionName}";
                var result = await client.ListBlobsAsync(prefix, delimiter: string.Empty, listFolders: false);
                var list = result.List.ToList();

                Assert.Equal(2, list.Count);
                Assert.Contains("2020-01-01", list[0].Name);
                Assert.Contains("2020-02-01", list[1].Name);

                var blob = await client.GetBlobAsync(list[0].Name);
                await using var ms = new MemoryStream();
                blob.Data.CopyTo(ms);

                using (var parquetReader = new ParquetReader(ms))
                {
                    Assert.Equal(1, parquetReader.RowGroupCount);

                    var expectedFields = new[] { "Company", "RequireAt", "ItemsCount", "TotalCost", ParquetTransformedItems.DefaultIdColumn, ParquetTransformedItems.LastModifiedColumn };

                    Assert.Equal(expectedFields.Length, parquetReader.Schema.Fields.Count);

                    using var rowGroupReader = parquetReader.OpenRowGroupReader(0);
                    foreach (var field in parquetReader.Schema.Fields)
                    {
                        Assert.True(field.Name.In(expectedFields));

                        var data = rowGroupReader.ReadColumn((DataField)field).Data;
                        Assert.True(data.Length == 31);
                    }
                }
            }

            //sales
            using (var client = RavenAzureClient.Create(settings, DefaultBackupConfiguration))
            {
                var prefix = $"{settings.RemoteFolderName}/{salesTableName}";
                var result = await client.ListBlobsAsync(prefix, delimiter: string.Empty, listFolders: false);
                var list = result.List.ToList();

                Assert.Equal(2, list.Count);
                Assert.Contains("2020-01-01", list[0].Name);
                Assert.Contains("2020-02-01", list[1].Name);

                var blob = await client.GetBlobAsync(list[1].Name);
                await using var ms = new MemoryStream();
                blob.Data.CopyTo(ms);

                using (var parquetReader = new ParquetReader(ms))
                {
                    Assert.Equal(1, parquetReader.RowGroupCount);

                    var expectedFields = new[] { "Qty", "Product", "Cost", ParquetTransformedItems.DefaultIdColumn, ParquetTransformedItems.LastModifiedColumn };

                    Assert.Equal(expectedFields.Length, parquetReader.Schema.Fields.Count);

                    using var rowGroupReader = parquetReader.OpenRowGroupReader(0);
                    foreach (var field in parquetReader.Schema.Fields)
                    {
                        Assert.True(field.Name.In(expectedFields));

                        var data = rowGroupReader.ReadColumn((DataField)field).Data;
                        Assert.True(data.Length == 28 * 5);
                    }
                }
            }
        }
    }
    finally
    {
        await DeleteObjects(settings, salesTableName);
    }
}
public async Task SimpleTransformation_NoPartition()
{
    var settings = GetAzureSettings();

    try
    {
        using (var store = GetDocumentStore())
        {
            var baseline = new DateTime(2020, 1, 1).ToUniversalTime();

            using (var session = store.OpenAsyncSession())
            {
                for (int i = 0; i < 100; i++)
                {
                    await session.StoreAsync(new Order
                    {
                        Id = $"orders/{i}",
                        OrderedAt = baseline.AddDays(i),
                        ShipVia = $"shippers/{i}",
                        Company = $"companies/{i}"
                    });
                }

                await session.SaveChangesAsync();
            }

            var etlDone = WaitForEtl(store, (n, statistics) => statistics.LoadSuccesses != 0);

            var script = @"
loadToOrders(noPartition(),
    {
        OrderDate : this.OrderedAt,
        Company : this.Company,
        ShipVia : this.ShipVia
    });
";

            SetupAzureEtl(store, script, settings);
            etlDone.Wait(TimeSpan.FromMinutes(1));

            using (var client = RavenAzureClient.Create(settings, DefaultBackupConfiguration))
            {
                var prefix = $"{settings.RemoteFolderName}/{CollectionName}";
                var cloudObjects = await client.ListBlobsAsync(prefix, delimiter: string.Empty, listFolders: false);
                var list = cloudObjects.List.ToList();

                Assert.Equal(1, list.Count);

                var blob = await client.GetBlobAsync(list[0].Name);
                await using var ms = new MemoryStream();
                blob.Data.CopyTo(ms);

                using (var parquetReader = new ParquetReader(ms))
                {
                    Assert.Equal(1, parquetReader.RowGroupCount);

                    var expectedFields = new[] { "OrderDate", "ShipVia", "Company", ParquetTransformedItems.DefaultIdColumn, ParquetTransformedItems.LastModifiedColumn };

                    Assert.Equal(expectedFields.Length, parquetReader.Schema.Fields.Count);

                    using var rowGroupReader = parquetReader.OpenRowGroupReader(0);
                    foreach (var field in parquetReader.Schema.Fields)
                    {
                        Assert.True(field.Name.In(expectedFields));

                        var data = rowGroupReader.ReadColumn((DataField)field).Data;
                        Assert.True(data.Length == 100);

                        if (field.Name == ParquetTransformedItems.LastModifiedColumn)
                            continue;

                        var count = 0;
                        foreach (var val in data)
                        {
                            if (field.Name == "OrderDate")
                            {
                                var expectedDto = new DateTimeOffset(DateTime.SpecifyKind(baseline.AddDays(count), DateTimeKind.Utc));
                                Assert.Equal(expectedDto, val);
                            }
                            else
                            {
                                var expected = field.Name switch
                                {
                                    ParquetTransformedItems.DefaultIdColumn => $"orders/{count}",
                                    "Company" => $"companies/{count}",
                                    "ShipVia" => $"shippers/{count}",
                                    _ => null
                                };

                                Assert.Equal(expected, val);
                            }

                            count++;
                        }
                    }
                }
            }
        }
    }
    finally
    {
        await DeleteObjects(settings);
    }
}
public async Task SimpleTransformation_MultiplePartitions()
{
    var settings = GetAzureSettings();
    var prefix = $"{settings.RemoteFolderName}/{CollectionName}/";

    try
    {
        using (var store = GetDocumentStore())
        {
            var baseline = DateTime.SpecifyKind(new DateTime(2020, 1, 1), DateTimeKind.Utc);

            using (var session = store.OpenAsyncSession())
            {
                const int total = 31 + 28; // days in January + days in February

                for (int i = 0; i < total; i++)
                {
                    var orderedAt = baseline.AddDays(i);
                    await session.StoreAsync(new Order
                    {
                        Id = $"orders/{i}",
                        OrderedAt = orderedAt,
                        RequireAt = orderedAt.AddDays(7),
                        ShipVia = $"shippers/{i}",
                        Company = $"companies/{i}"
                    });
                }

                for (int i = 1; i <= 37; i++)
                {
                    var index = i + total;
                    var orderedAt = baseline.AddYears(1).AddMonths(1).AddDays(i);
                    await session.StoreAsync(new Order
                    {
                        Id = $"orders/{index}",
                        OrderedAt = orderedAt,
                        RequireAt = orderedAt.AddDays(7),
                        ShipVia = $"shippers/{index}",
                        Company = $"companies/{index}"
                    });
                }

                await session.SaveChangesAsync();
            }

            var etlDone = WaitForEtl(store, (n, statistics) => statistics.LoadSuccesses != 0 && statistics.LoadErrors == 0);

            var script = @"
var orderDate = new Date(this.OrderedAt);

loadToOrders(partitionBy(
    ['year', orderDate.getFullYear()],
    ['month', orderDate.getMonth() + 1]
),
    {
        Company : this.Company,
        ShipVia : this.ShipVia,
        RequireAt : this.RequireAt
    });
";

            SetupAzureEtl(store, script, settings);
            etlDone.Wait(TimeSpan.FromMinutes(1));

            var expectedFields = new[] { "RequireAt", "ShipVia", "Company", ParquetTransformedItems.DefaultIdColumn, ParquetTransformedItems.LastModifiedColumn };

            using (var client = RavenAzureClient.Create(settings, DefaultBackupConfiguration))
            {
                var cloudObjects = await client.ListBlobsAsync(prefix, delimiter: "/", listFolders: true);
                var list = cloudObjects.List.ToList();

                Assert.Equal(2, list.Count);
                Assert.Contains("Orders/year=2020/", list[0].Name);
                Assert.Contains("Orders/year=2021/", list[1].Name);

                for (var index = 1; index <= list.Count; index++)
                {
                    var folder = list[index - 1];
                    var objectsInFolder = await client.ListBlobsAsync(prefix: folder.Name, delimiter: "/", listFolders: true);
                    var objects = objectsInFolder.List.ToList();

                    Assert.Equal(2, objects.Count);
                    Assert.Contains($"month={index}/", objects[0].Name);
                    Assert.Contains($"month={index + 1}/", objects[1].Name);
                }

                var files = await ListAllFilesInFolders(client, list);
                Assert.Equal(4, files.Count);

                foreach (var filePath in files)
                {
                    var blob = await client.GetBlobAsync(filePath);
                    await using var ms = new MemoryStream();
                    blob.Data.CopyTo(ms);

                    using (var parquetReader = new ParquetReader(ms))
                    {
                        Assert.Equal(1, parquetReader.RowGroupCount);
                        Assert.Equal(expectedFields.Length, parquetReader.Schema.Fields.Count);

                        using var rowGroupReader = parquetReader.OpenRowGroupReader(0);
                        foreach (var field in parquetReader.Schema.Fields)
                        {
                            Assert.True(field.Name.In(expectedFields));

                            var data = rowGroupReader.ReadColumn((DataField)field).Data;
                            Assert.True(data.Length == 31 || data.Length == 28 || data.Length == 27 || data.Length == 10);

                            if (field.Name != "RequireAt")
                                continue;

                            var count = data.Length switch
                            {
                                31 => 0,
                                28 => 31,
                                27 => 365 + 33,
                                10 => 365 + 33 + 27,
                                _ => throw new ArgumentOutOfRangeException()
                            };

                            foreach (var val in data)
                            {
                                var expectedOrderDate = new DateTimeOffset(DateTime.SpecifyKind(baseline.AddDays(count++), DateTimeKind.Utc));
                                var expected = expectedOrderDate.AddDays(7);
                                Assert.Equal(expected, val);
                            }
                        }
                    }
                }
            }
        }
    }
    finally
    {
        await DeleteObjects(settings, prefix, delimiter: "/", listFolder: true);
    }
}