public void Extract_array_columns()
{
   Schema schema = SchemaReflector.Reflect<ArrayColumns>();
   Assert.Equal(2, schema.Length);

   var extractor = new ColumnExtractor();

   ArrayColumns[] ac =
   {
      new ArrayColumns { Id = 1, Addresses = new[] { "Fiddler", "On" } },
      new ArrayColumns { Id = 2, Addresses = new[] { "The", "Roof" } }
   };

   List<DataColumn> columns = extractor.ExtractColumns(ac, schema).ToList();

   Assert.Equal(new[] { 1, 2 }, columns[0].DefinedData);
   Assert.Equal(new[] { "Fiddler", "On", "The", "Roof" }, columns[1].DefinedData);
   Assert.Equal(new[] { 0, 1, 0, 1 }, columns[1].RepetitionLevels);
}
/// <summary>
/// Deserialises a Parquet stream into a collection of classes.
/// </summary>
/// <typeparam name="T">Class type</typeparam>
/// <param name="input">Source stream containing Parquet data</param>
/// <param name="rowGroupIndex">Zero-based index of the row group to read, or -1 (the default) to read all row groups</param>
/// <returns>Array of deserialised class instances</returns>
public static T[] Deserialize<T>(Stream input, int rowGroupIndex = -1) where T : new()
{
   var result = new List<T>();

   using (var reader = new ParquetReader(input))
   {
      Schema fileSchema = new SchemaReflector(typeof(T)).Reflect();
      DataField[] dataFields = fileSchema.GetDataFields();

      if (rowGroupIndex == -1) //means read all row groups
      {
         for (int i = 0; i < reader.RowGroupCount; i++)
         {
            T[] currentRowGroupRecords = ReadAndDeserializeByRowGroup<T>(i, reader, dataFields);
            result.AddRange(currentRowGroupRecords);
         }
      }
      else //read a specific row group
      {
         T[] currentRowGroupRecords = ReadAndDeserializeByRowGroup<T>(rowGroupIndex, reader, dataFields);
         result.AddRange(currentRowGroupRecords);
      }
   }

   return result.ToArray();
}
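//A hedged usage sketch (not from the original source): it assumes the Deserialize overload above
//is exposed on ParquetConvert, as the round-trip test later in this file suggests, and uses a
//hypothetical Record class purely for illustration.
public static Record[] ReadSecondRowGroup(Stream parquetStream)
{
   //rowGroupIndex is zero-based, so 1 selects the second row group; -1 (the default) reads them all
   return ParquetConvert.Deserialize<Record>(parquetStream, rowGroupIndex: 1);
}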
public AssignArrayDelegate GenerateAssigner(DataColumn dataColumn, Type classType)
{
   DataField fileField = dataColumn.Field;
   Schema typeSchema = SchemaReflector.Reflect(classType);
   DataField typeField = typeSchema.FindDataField(fileField.Path);

   Type[] methodArgs = { typeof(DataColumn), typeof(Array) };
   var runMethod = new DynamicMethod(
      $"Set{classType.Name}{typeField.ClrPropName}",
      typeof(void),
      methodArgs,
      GetType().GetTypeInfo().Module);

   ILGenerator il = runMethod.GetILGenerator();

   //set class property method
   TypeInfo ti = classType.GetTypeInfo();
   PropertyInfo pi = ti.GetDeclaredProperty(typeField.ClrPropName ?? typeField.Name);
   MethodInfo setValueMethod = pi.SetMethod;

   TypeInfo dcti = dataColumn.GetType().GetTypeInfo();
   MethodInfo getDataMethod = dcti.GetDeclaredProperty(nameof(DataColumn.Data)).GetMethod;
   MethodInfo getRepsMethod = dcti.GetDeclaredProperty(nameof(DataColumn.RepetitionLevels)).GetMethod;

   TypeConversion conversion = GetConversion(dataColumn.Field.ClrNullableIfHasNullsType, pi.PropertyType);

   GenerateAssigner(il, classType, typeField,
      setValueMethod,
      getDataMethod,
      getRepsMethod,
      conversion);

   return (AssignArrayDelegate)runMethod.CreateDelegate(typeof(AssignArrayDelegate));
}
public static Stream GetParquetFileWithThreeRowGroups()
{
   var stream = new MemoryStream();
   var schema = SchemaReflector.Reflect<TwoColumn>();

   using (var parquetWriter = new ParquetWriter(schema, stream))
   {
      using (var rowGroup = parquetWriter.CreateRowGroup())
      {
         rowGroup.WriteColumn(new Parquet.Data.DataColumn((DataField)schema.Fields[0], new[] { 1, 2, 3, 4 }));
         rowGroup.WriteColumn(new Parquet.Data.DataColumn((DataField)schema.Fields[1], new[] { "one", "two", "three", "four" }));
      }

      using (var rowGroup = parquetWriter.CreateRowGroup())
      {
         rowGroup.WriteColumn(new Parquet.Data.DataColumn((DataField)schema.Fields[0], new[] { 5, 6, 7, 8 }));
         rowGroup.WriteColumn(new Parquet.Data.DataColumn((DataField)schema.Fields[1], new[] { "five", "six", "seven", "eight" }));
      }

      using (var rowGroup = parquetWriter.CreateRowGroup())
      {
         rowGroup.WriteColumn(new Parquet.Data.DataColumn((DataField)schema.Fields[0], new[] { 9, 10, 11, 12 }));
         rowGroup.WriteColumn(new Parquet.Data.DataColumn((DataField)schema.Fields[1], new[] { "nine", "ten", "eleven", "twelve" }));
      }
   }

   stream.Position = 0;
   return stream;
}
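//A hedged example (not in the original source) showing how the helper above could be consumed:
//it assumes the ParquetConvert.Deserialize overload shown earlier and that TwoColumn maps onto the
//two fields written above (an int column and a string column).
public static void ReadBackThirdRowGroup()
{
   using (Stream stream = GetParquetFileWithThreeRowGroups())
   {
      //row group index 2 is the third group, i.e. the rows 9..12 / "nine".."twelve"
      TwoColumn[] rows = ParquetConvert.Deserialize<TwoColumn>(stream, rowGroupIndex: 2);
      Assert.Equal(4, rows.Length);
   }
}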
/// <summary>
/// Serialises a collection of classes into a Parquet stream.
/// </summary>
/// <typeparam name="T">Class type</typeparam>
/// <param name="objectInstances">Collection of classes</param>
/// <param name="destination">Destination stream</param>
/// <param name="schema">Optional schema to use. When not specified the class schema will be discovered and everything possible will be
/// written to the stream. If you want to write only a subset of class properties please specify the schema yourself.
/// </param>
/// <param name="compressionMethod"><see cref="CompressionMethod"/></param>
/// <param name="rowGroupSize">Number of records to write into each row group</param>
/// <param name="append">When true, appends to an existing Parquet stream instead of overwriting it</param>
/// <returns>The schema used for serialisation</returns>
public static Schema Serialize<T>(IEnumerable<T> objectInstances, Stream destination,
   Schema schema = null,
   CompressionMethod compressionMethod = CompressionMethod.Snappy,
   int rowGroupSize = 5000,
   bool append = false)
   where T : new()
{
   if (objectInstances == null) throw new ArgumentNullException(nameof(objectInstances));
   if (destination == null) throw new ArgumentNullException(nameof(destination));
   if (!destination.CanWrite) throw new ArgumentException("stream must be writeable", nameof(destination));

   //if schema is not passed reflect it
   if (schema == null)
   {
      schema = SchemaReflector.Reflect<T>();
   }

   using (var writer = new ParquetWriter(schema, destination, append: append))
   {
      writer.CompressionMethod = compressionMethod;

      DataField[] dataFields = schema.GetDataFields();

      foreach (IEnumerable<T> batch in objectInstances.Batch(rowGroupSize))
      {
         var bridge = new ClrBridge(typeof(T));
         T[] batchArray = batch.ToArray();

         DataColumn[] columns = dataFields
            .Select(df => bridge.BuildColumn(df, batchArray, batchArray.Length))
            .ToArray();

         using (ParquetRowGroupWriter groupWriter = writer.CreateRowGroup())
         {
            foreach (DataColumn dataColumn in columns)
            {
               groupWriter.WriteColumn(dataColumn);
            }
         }
      }
   }

   return schema;
}
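//A hedged usage sketch (not part of the original source) for the Serialize overload above,
//assuming it lives on ParquetConvert as the round-trip test below uses it; the Record class
//and file path are hypothetical.
public static Schema WriteRecords(IEnumerable<Record> records)
{
   using (FileStream destination = File.Create("records.parquet"))
   {
      //the schema is inferred from Record via SchemaReflector because none is passed explicitly;
      //rows are written with Gzip compression in row groups of 1000 records
      return ParquetConvert.Serialize(records, destination,
         compressionMethod: CompressionMethod.Gzip,
         rowGroupSize: 1000);
   }
}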
public static Schema Serialize<T>(IEnumerable<T> objectInstances, Stream destination,
   Schema schema = null,
   WriterOptions writerOptions = null,
   CompressionMethod compressionMethod = CompressionMethod.Snappy)
   where T : new()
{
   if (objectInstances == null) throw new ArgumentNullException(nameof(objectInstances));
   if (destination == null) throw new ArgumentNullException(nameof(destination));
   if (!destination.CanWrite) throw new ArgumentException("stream must be writeable", nameof(destination));

   //if schema is not passed reflect it
   if (schema == null)
   {
      schema = SchemaReflector.Reflect<T>();
   }

   if (writerOptions == null)
   {
      writerOptions = new WriterOptions();
   }

   var extractor = new ColumnExtractor();

   using (var writer = new ParquetWriter3(schema, destination, writerOptions: writerOptions))
   {
      writer.CompressionMethod = compressionMethod;

      foreach (IEnumerable<T> batch in objectInstances.Batch(writerOptions.RowGroupsSize))
      {
         IReadOnlyCollection<DataColumn> columns = extractor.ExtractColumns(batch, schema);

         using (ParquetRowGroupWriter groupWriter = writer.CreateRowGroup(batch.Count()))
         {
            foreach (DataColumn dataColumn in columns)
            {
               groupWriter.Write(dataColumn);
            }
         }
      }
   }

   return schema;
}
void TestRoundTripSerialization<T>(T value)
{
   StructureWithTestType<T> input = new StructureWithTestType<T>
   {
      Id = "1",
      TestValue = value,
   };

   Schema schema = SchemaReflector.Reflect<StructureWithTestType<T>>();

   using (MemoryStream stream = new MemoryStream())
   {
      ParquetConvert.Serialize<StructureWithTestType<T>>(new StructureWithTestType<T>[] { input }, stream, schema);

      stream.Position = 0;

      StructureWithTestType<T>[] output = ParquetConvert.Deserialize<StructureWithTestType<T>>(stream);
      Assert.Single(output);
      Assert.Equal("1", output[0].Id);
      Assert.Equal(value, output[0].TestValue);
   }
}
public void Extract_simple_columns()
{
   Schema schema = new SchemaReflector(typeof(SimpleColumns)).Reflect();

   var extractor = new ColumnExtractor();

   SimpleColumns[] classes = new[]
   {
      new SimpleColumns { Id = 1, Name = "First" },
      new SimpleColumns { Id = 2, Name = "Second" },
      new SimpleColumns { Id = 3, Name = "Third" }
   };

   //ExtractColumns returns a read-only collection, so materialise it before indexing
   List<DataColumn> columns = extractor.ExtractColumns(classes, schema).ToList();

   Assert.Equal(new[] { 1, 2, 3 }, columns[0].DefinedData);
   Assert.Equal(new[] { "First", "Second", "Third" }, columns[1].DefinedData);
}
public void I_can_infer_different_types()
{
   var inferrer = new SchemaReflector(typeof(PocoClass));

   Schema schema = inferrer.Reflect();

   Assert.NotNull(schema);
   Assert.Equal(4, schema.Length);

   DataField id = (DataField)schema[0];
   Assert.Equal("Id", id.Name);
   Assert.Equal(DataType.Int32, id.DataType);
   Assert.False(id.HasNulls);
   Assert.False(id.IsArray);

   DataField altId = (DataField)schema[1];
   Assert.Equal("AltId", altId.Name);
   Assert.Equal(DataType.Int32, altId.DataType);
   Assert.False(altId.HasNulls);
   Assert.False(altId.IsArray);

   DataField nullableFloat = (DataField)schema[2];
   Assert.Equal("NullableFloat", nullableFloat.Name);
   Assert.Equal(DataType.Float, nullableFloat.DataType);
   Assert.True(nullableFloat.HasNulls);
   Assert.False(nullableFloat.IsArray);

   DataField intArray = (DataField)schema[3];
   Assert.Equal("IntArray", intArray.Name);
   Assert.Equal(DataType.Int32, intArray.DataType);
   Assert.False(intArray.HasNulls);
   Assert.True(intArray.IsArray);
}
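//A hedged sketch (not from the original source) of a PocoClass shape that would satisfy the
//assertions above; the real class may differ, e.g. AltId could be mapped through an attribute.
public class PocoClass
{
   public int Id { get; set; }

   public int AltId { get; set; }

   public float? NullableFloat { get; set; }

   public int[] IntArray { get; set; }
}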
/// <summary>
/// Uploads the finished query data to S3
/// </summary>
/// <param name="finishedQueries">The finished queries to upload</param>
/// <param name="bucket">The destination S3 bucket</param>
/// <param name="format">The output format, either "csv" or "parquet"</param>
/// <param name="context">The Lambda execution context</param>
/// <returns></returns>
private static async Task WriteDataAsync(IEnumerable<AthenaQueryMetric> finishedQueries, string bucket, string format, ILambdaContext context)
{
   if (finishedQueries == null)
   {
      throw new ArgumentNullException(nameof(finishedQueries));
   }

   if (String.IsNullOrEmpty(bucket))
   {
      throw new ArgumentNullException(nameof(bucket));
   }

   if (context == null)
   {
      throw new ArgumentNullException(nameof(context));
   }

   foreach (IGrouping<string, AthenaQueryMetric> Group in finishedQueries.GroupBy(x => x.BillingPeriod))
   {
      // Maintains all of the disposables that need to be disposed of at the end, but
      // not before the streams have been completely read and uploaded; otherwise it causes
      // a race condition if we use a using block where the streams will close before the
      // transfer utility has finished the upload
      List<IDisposable> Disposables = new List<IDisposable>();

      // The memory stream the compressed stream will be written into
      MemoryStream MStreamOut = new MemoryStream();
      Disposables.Add(MStreamOut);

      try
      {
         switch (format)
         {
            default:
            case "csv":
            {
               // The GZip stream only writes its 10-byte file footer when the stream is closed.
               // Calling dispose via the using block flushes and closes the stream first, causing
               // the footer data to be written out to the memory stream. The third parameter "true"
               // allows the memory stream to still access the gzip stream data; otherwise, when trying
               // to upload the stream via the transfer utility, it will cause an exception that the
               // stream is closed
               using (GZipStream Gzip = new GZipStream(MStreamOut, CompressionLevel.Optimal, true))
               {
                  TextWriter TWriter = new StreamWriter(Gzip);
                  CsvWriter Writer = new CsvWriter(TWriter);
                  Writer.Configuration.RegisterClassMap<AthenaQueryMetricCsvMapping>();

                  Disposables.Add(Writer);
                  Disposables.Add(TWriter);

                  Writer.WriteHeader<AthenaQueryMetric>();
                  Writer.NextRecord(); // Advance the writer to the next line before writing the records
                  Writer.WriteRecords<AthenaQueryMetric>(finishedQueries);

                  // Make sure to flush all of the data to the stream
                  Writer.Flush();
                  TWriter.Flush();
               }

               break;
            }
            case "parquet":
            {
               Schema PSchema = SchemaReflector.Reflect<AthenaQueryMetric>();
               //ParquetConvert.Serialize<AthenaQueryMetric>(finishedQueries, MStreamOut, PSchema);
               break;
            }
         }

         // Make the transfer utility request to post the query data csv content
         TransferUtilityUploadRequest Request = new TransferUtilityUploadRequest()
         {
            BucketName = bucket,
            Key = $"data/billingperiod={Group.Key}/{finishedQueries.First().QueryExecutionId}_{finishedQueries.Last().QueryExecutionId}.csv.gz",
            InputStream = MStreamOut,
            AutoResetStreamPosition = true,
            AutoCloseStream = true,
            ContentType = "text/csv"
         };

         using (TransferUtility XferUtil = new TransferUtility(_S3Client))
         {
            try
            {
               context.LogInfo($"Starting file upload of {MStreamOut.Length} bytes: {Request.Key}.");

               // Make the upload
               await XferUtil.UploadAsync(Request);

               context.LogInfo($"Finished upload of {Request.Key}.");
            }
            catch (Exception e)
            {
               string Message = $"Failed to upload data file to s3://{Request.BucketName}/{Request.Key}.";
               context.LogError(Message, e);
               await SNSNotify(e, Message, context);
            }
         }
      }
      catch (Exception e)
      {
         context.LogError(e);
         await SNSNotify(e, context);
      }
      finally
      {
         // Dispose all of the streams and writers used to write the CSV content. We need to
         // dispose of these here so the memory stream doesn't get closed by disposing of the
         // writers too early, which would cause the transfer utility to fail the upload
         foreach (IDisposable Item in Disposables)
         {
            try
            {
               Item.Dispose();
            }
            catch { }
         }

         // Make sure memory is cleaned up
         GC.Collect();
         GC.WaitForPendingFinalizers();
      }
   }
}