Example #1
        public void Extract_array_columns()
        {
            Schema schema = SchemaReflector.Reflect<ArrayColumns>();

            Assert.Equal(2, schema.Length);
            var extractor = new ColumnExtractor();

            ArrayColumns[] ac =
            {
                new ArrayColumns
                {
                    Id        = 1,
                    Addresses = new[]{ "Fiddler",                       "On"   }
                },
                new ArrayColumns
                {
                    Id        = 2,
                    Addresses = new[]{ "The",                           "Roof" }
                }
            };

            List<DataColumn> columns = extractor.ExtractColumns(ac, schema).ToList();

            Assert.Equal(new[] { 1, 2 }, columns[0].DefinedData);

            Assert.Equal(new[] { "Fiddler", "On", "The", "Roof" }, columns[1].DefinedData);
            Assert.Equal(new[] { 0, 1, 0, 1 }, columns[1].RepetitionLevels);
        }
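
The ArrayColumns class under test is not shown. A minimal sketch consistent with the assertions above might look like this; the property names come from the test itself, everything else is an assumption:

        //assumed shape of the POCO reflected in Example #1
        class ArrayColumns
        {
            public int Id { get; set; }

            //reflected as a repeated (array) data field, which is why the values of
            //both instances flatten into one column with repetition levels 0, 1, 0, 1
            public string[] Addresses { get; set; }
        }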
Example #2
        /// <summary>
        /// Deserialises a Parquet stream into an array of class instances.
        /// </summary>
        /// <typeparam name="T">Class type to deserialise into</typeparam>
        /// <param name="input">Input stream containing Parquet data</param>
        /// <param name="rowGroupIndex">Index of the row group to read, or -1 to read all row groups</param>
        /// <returns>Array of deserialised class instances</returns>
        public static T[] Deserialize<T>(Stream input, int rowGroupIndex = -1) where T : new()
        {
            var result = new List<T>();

            using (var reader = new ParquetReader(input))
            {
                Schema      fileSchema = new SchemaReflector(typeof(T)).Reflect();
                DataField[] dataFields = fileSchema.GetDataFields();

                if (rowGroupIndex == -1) //Means read all row groups.
                {
                    for (int i = 0; i < reader.RowGroupCount; i++)
                    {
                        T[] currentRowGroupRecords = ReadAndDeserializeByRowGroup<T>(i, reader, dataFields);
                        result.AddRange(currentRowGroupRecords);
                    }
                }
                else //read a specific row group.
                {
                    T[] currentRowGroupRecords = ReadAndDeserializeByRowGroup<T>(rowGroupIndex, reader, dataFields);
                    result.AddRange(currentRowGroupRecords);
                }
            }
            return result.ToArray();
        }
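
A minimal usage sketch of the method above, assuming it is exposed as ParquetConvert.Deserialize (as Example #7 suggests); Record and the file path are hypothetical placeholders:

        //read every row group of a Parquet file into Record instances;
        //rowGroupIndex defaults to -1, which reads all row groups
        using (FileStream fs = File.OpenRead("data.parquet"))
        {
            Record[] all = ParquetConvert.Deserialize<Record>(fs);
        }

Passing an explicit index, e.g. Deserialize<Record>(fs, rowGroupIndex: 1), reads just that one row group.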
Example #3
        public AssignArrayDelegate GenerateAssigner(DataColumn dataColumn, Type classType)
        {
            DataField fileField  = dataColumn.Field;
            Schema    typeSchema = SchemaReflector.Reflect(classType);
            DataField typeField  = typeSchema.FindDataField(fileField.Path);

            Type[] methodArgs = { typeof(DataColumn), typeof(Array) };
            var    runMethod  = new DynamicMethod(
                $"Set{classType.Name}{typeField.ClrPropName}",
                typeof(void),
                methodArgs,
                GetType().GetTypeInfo().Module);

            ILGenerator il = runMethod.GetILGenerator();

            //set class property method
            TypeInfo     ti             = classType.GetTypeInfo();
            PropertyInfo pi             = ti.GetDeclaredProperty(typeField.ClrPropName ?? typeField.Name);
            MethodInfo   setValueMethod = pi.SetMethod;

            TypeInfo   dcti          = dataColumn.GetType().GetTypeInfo();
            MethodInfo getDataMethod = dcti.GetDeclaredProperty(nameof(DataColumn.Data)).GetMethod;
            MethodInfo getRepsMethod = dcti.GetDeclaredProperty(nameof(DataColumn.RepetitionLevels)).GetMethod;

            TypeConversion conversion = GetConversion(dataColumn.Field.ClrNullableIfHasNullsType, pi.PropertyType);

            GenerateAssigner(il, classType, typeField,
                             setValueMethod,
                             getDataMethod,
                             getRepsMethod,
                             conversion);

            return (AssignArrayDelegate)runMethod.CreateDelegate(typeof(AssignArrayDelegate));
        }
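
The delegate type returned above is not shown; its signature can be inferred from methodArgs (a DataColumn and an Array parameter, returning void). A hedged sketch of what it might look like and how the generated setter could be invoked:

        //inferred from methodArgs above: the generated method takes the column
        //being read and the array of class instances whose property it populates
        delegate void AssignArrayDelegate(DataColumn dataColumn, Array classInstances);

        //hypothetical invocation: copy one file column into the matching
        //property of every element of `instances`
        AssignArrayDelegate assign = generator.GenerateAssigner(column, typeof(MyPoco));
        assign(column, instances);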
Example #4
        public static Stream GetParquetFileWithThreeRowGroups()
        {
            var stream = new MemoryStream();
            var schema = SchemaReflector.Reflect<TwoColumn>();

            using (var parquetWriter = new ParquetWriter(schema, stream))
            {
                using (var rowGroup = parquetWriter.CreateRowGroup())
                {
                    rowGroup.WriteColumn(new Parquet.Data.DataColumn(
                        (DataField)schema.Fields[0], new[] { 1, 2, 3, 4 }));
                    rowGroup.WriteColumn(new Parquet.Data.DataColumn(
                        (DataField)schema.Fields[1], new[] { "one", "two", "three", "four" }));
                }

                using (var rowGroup = parquetWriter.CreateRowGroup())
                {
                    rowGroup.WriteColumn(new Parquet.Data.DataColumn(
                        (DataField)schema.Fields[0], new[] { 5, 6, 7, 8 }));
                    rowGroup.WriteColumn(new Parquet.Data.DataColumn(
                        (DataField)schema.Fields[1], new[] { "five", "six", "seven", "eight" }));
                }

                using (var rowGroup = parquetWriter.CreateRowGroup())
                {
                    rowGroup.WriteColumn(new Parquet.Data.DataColumn(
                        (DataField)schema.Fields[0], new[] { 9, 10, 11, 12 }));
                    rowGroup.WriteColumn(new Parquet.Data.DataColumn(
                        (DataField)schema.Fields[1], new[] { "nine", "ten", "eleven", "twelve" }));
                }
            }

            stream.Position = 0;
            return stream;
        }
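
A sketch of consuming the returned stream, assuming the Parquet.Net 3.x row-group reader API used elsewhere in these examples:

        using (Stream s = GetParquetFileWithThreeRowGroups())
        using (var reader = new ParquetReader(s))
        {
            //RowGroupCount should be 3, with four rows per group
            for (int i = 0; i < reader.RowGroupCount; i++)
            {
                using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(i))
                {
                    DataColumn ids = rg.ReadColumn(reader.Schema.GetDataFields()[0]);
                    //for the first group ids.Data holds { 1, 2, 3, 4 }, and so on
                }
            }
        }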
Example #5
        /// <summary>
        /// Serialises a collection of classes into a Parquet stream
        /// </summary>
        /// <typeparam name="T">Class type</typeparam>
        /// <param name="objectInstances">Collection of classes</param>
        /// <param name="destination">Destination stream</param>
        /// <param name="schema">Optional schema to use. When not specified the class schema will be discovered and everything possible will be
        /// written to the stream. If you want to write only a subset of class properties please specify the schema yourself.
        /// </param>
        /// <param name="compressionMethod"><see cref="CompressionMethod"/></param>
        /// <param name="rowGroupSize"></param>
        /// <param name="append"></param>
        /// <returns></returns>
        public static Schema Serialize<T>(IEnumerable<T> objectInstances, Stream destination,
                                           Schema schema = null,
                                           CompressionMethod compressionMethod = CompressionMethod.Snappy,
                                           int rowGroupSize = 5000,
                                           bool append      = false)
            where T : new()
        {
            if (objectInstances == null)
            {
                throw new ArgumentNullException(nameof(objectInstances));
            }
            if (destination == null)
            {
                throw new ArgumentNullException(nameof(destination));
            }
            if (!destination.CanWrite)
            {
                throw new ArgumentException("stream must be writeable", nameof(destination));
            }

            //if schema is not passed reflect it
            if (schema == null)
            {
                schema = SchemaReflector.Reflect<T>();
            }

            using (var writer = new ParquetWriter(schema, destination, append: append))
            {
                writer.CompressionMethod = compressionMethod;

                DataField[] dataFields = schema.GetDataFields();

                foreach (IEnumerable<T> batch in objectInstances.Batch(rowGroupSize))
                {
                    var bridge     = new ClrBridge(typeof(T));
                    T[] batchArray = batch.ToArray();

                    DataColumn[] columns = dataFields
                                           .Select(df => bridge.BuildColumn(df, batchArray, batchArray.Length))
                                           .ToArray();

                    using (ParquetRowGroupWriter groupWriter = writer.CreateRowGroup())
                    {
                        foreach (DataColumn dataColumn in columns)
                        {
                            groupWriter.WriteColumn(dataColumn);
                        }
                    }
                }
            }

            return schema;
        }
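
A hypothetical round trip through the method above (Record is a placeholder POCO with an Id property):

        var records = Enumerable.Range(1, 10000)
                                .Select(i => new Record { Id = i })
                                .ToList();

        using (var ms = new MemoryStream())
        {
            //10000 records with the default rowGroupSize of 5000 produce two row groups
            Schema used = Serialize(records, ms, compressionMethod: CompressionMethod.Gzip);
        }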
Example #6
        public static Schema Serialize<T>(IEnumerable<T> objectInstances, Stream destination,
                                           Schema schema = null,
                                           WriterOptions writerOptions         = null,
                                           CompressionMethod compressionMethod = CompressionMethod.Snappy)
            where T : new()
        {
            if (objectInstances == null)
            {
                throw new ArgumentNullException(nameof(objectInstances));
            }
            if (destination == null)
            {
                throw new ArgumentNullException(nameof(destination));
            }
            if (!destination.CanWrite)
            {
                throw new ArgumentException("stream must be writeable", nameof(destination));
            }

            //if schema is not passed reflect it
            if (schema == null)
            {
                schema = SchemaReflector.Reflect<T>();
            }

            if (writerOptions == null)
            {
                writerOptions = new WriterOptions();
            }

            var extractor = new ColumnExtractor();

            using (var writer = new ParquetWriter3(schema, destination, writerOptions: writerOptions))
            {
                writer.CompressionMethod = compressionMethod;

                foreach (IEnumerable<T> batch in objectInstances.Batch(writerOptions.RowGroupsSize))
                {
                    //materialise the batch once so it is not enumerated twice below
                    T[] batchArray = batch.ToArray();

                    IReadOnlyCollection<DataColumn> columns = extractor.ExtractColumns(batchArray, schema);

                    using (ParquetRowGroupWriter groupWriter = writer.CreateRowGroup(batchArray.Length))
                    {
                        foreach (DataColumn dataColumn in columns)
                        {
                            groupWriter.Write(dataColumn);
                        }
                    }
                }
            }

            return schema;
        }
Example #7
        void TestRoundTripSerialization<T>(T value)
        {
            StructureWithTestType<T> input = new StructureWithTestType<T>
            {
                Id        = "1",
                TestValue = value,
            };

            Schema schema = SchemaReflector.Reflect<StructureWithTestType<T>>();

            using (MemoryStream stream = new MemoryStream())
            {
                ParquetConvert.Serialize<StructureWithTestType<T>>(new StructureWithTestType<T>[] { input }, stream, schema);

                stream.Position = 0;
                StructureWithTestType<T>[] output = ParquetConvert.Deserialize<StructureWithTestType<T>>(stream);
                Assert.Single(output);
                Assert.Equal("1", output[0].Id);
                Assert.Equal(value, output[0].TestValue);
            }
        }
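
The generic wrapper used above is not shown; a shape consistent with the test (an assumption beyond the two property names) would be:

        //assumed shape of the round-trip test fixture
        class StructureWithTestType<T>
        {
            public string Id { get; set; }

            //the generic payload whose round-trip fidelity the test asserts
            public T TestValue { get; set; }
        }

The test would then be driven per type, e.g. TestRoundTripSerialization<float?>(1.23f) or TestRoundTripSerialization<string>("hello").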
Example #8
        public void Extract_simple_columns()
        {
            Schema schema    = new SchemaReflector(typeof(SimpleColumns)).Reflect();
            var    extractor = new ColumnExtractor();

            SimpleColumns[] classes =
            {
                new SimpleColumns { Id = 1, Name = "First" },
                new SimpleColumns { Id = 2, Name = "Second" },
                new SimpleColumns { Id = 3, Name = "Third" }
            };

            List<DataColumn> columns = extractor.ExtractColumns(classes, schema).ToList();

            Assert.Equal(new[] { 1, 2, 3 }, columns[0].DefinedData);
            Assert.Equal(new[] { "First", "Second", "Third" }, columns[1].DefinedData);
        }
Example #9
        public void I_can_infer_different_types()
        {
            var inferrer = new SchemaReflector(typeof(PocoClass));

            Schema schema = inferrer.Reflect();

            Assert.NotNull(schema);
            Assert.Equal(4, schema.Length);

            DataField id = (DataField)schema[0];

            Assert.Equal("Id", id.Name);
            Assert.Equal(DataType.Int32, id.DataType);
            Assert.False(id.HasNulls);
            Assert.False(id.IsArray);

            DataField altId = (DataField)schema[1];

            Assert.Equal("AltId", altId.Name);
            Assert.Equal(DataType.Int32, altId.DataType);
            Assert.False(altId.HasNulls);
            Assert.False(altId.IsArray);

            DataField nullableFloat = (DataField)schema[2];

            Assert.Equal("NullableFloat", nullableFloat.Name);
            Assert.Equal(DataType.Float, nullableFloat.DataType);
            Assert.True(nullableFloat.HasNulls);
            Assert.False(nullableFloat.IsArray);

            DataField intArray = (DataField)schema[3];

            Assert.Equal("IntArray", intArray.Name);
            Assert.Equal(DataType.Int32, intArray.DataType);
            Assert.False(intArray.HasNulls);
            Assert.True(intArray.IsArray);
        }
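
The assertions pin PocoClass down fairly precisely; a sketch consistent with all of them (whether AltId is a plain property or a renamed one is an assumption):

        //reconstructed from the assertions in the test above
        class PocoClass
        {
            public int    Id            { get; set; } //Int32, no nulls, not an array
            public int    AltId         { get; set; } //may be a renamed/annotated property in the real fixture
            public float? NullableFloat { get; set; } //nullable float => HasNulls is true
            public int[]  IntArray      { get; set; } //array => IsArray is true
        }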
Example #10
        /// <summary>
        /// Uploads the finished query data to S3
        /// </summary>
        /// <param name="finishedQueries">The finished queries to upload</param>
        /// <param name="bucket">Destination S3 bucket name</param>
        /// <param name="format">Output format, either "csv" (the default) or "parquet"</param>
        /// <param name="context">The Lambda execution context, used for logging</param>
        /// <returns>A Task representing the asynchronous operation</returns>
        private static async Task WriteDataAsync(IEnumerable <AthenaQueryMetric> finishedQueries, string bucket, string format, ILambdaContext context)
        {
            if (finishedQueries == null)
            {
                throw new ArgumentNullException(nameof(finishedQueries));
            }

            if (String.IsNullOrEmpty(bucket))
            {
                throw new ArgumentNullException(nameof(bucket));
            }

            if (context == null)
            {
                throw new ArgumentNullException(nameof(context));
            }

            foreach (IGrouping<string, AthenaQueryMetric> Group in finishedQueries.GroupBy(x => x.BillingPeriod))
            {
                // Maintains all of the disposables that need to be disposed of at the end, but
                // not before the streams have been completely read and uploaded. Disposing them
                // early in a using block would close the streams before the transfer utility has
                // finished the upload, causing a race condition.
                List<IDisposable> Disposables = new List<IDisposable>();

                // The memory stream the compressed stream will be written into
                MemoryStream MStreamOut = new MemoryStream();
                Disposables.Add(MStreamOut);

                try
                {
                    switch (format)
                    {
                    default:
                    case "csv":
                    {
                        // The GZip stream only writes its 10-byte file footer when the stream is closed.
                        // Disposing it via the using block flushes and closes the stream, causing the
                        // footer data to be written out to the memory stream. The third parameter, "true",
                        // leaves the memory stream open after the gzip stream is disposed; otherwise the
                        // transfer utility would throw an exception when uploading because the stream
                        // is closed.
                        using (GZipStream Gzip = new GZipStream(MStreamOut, CompressionLevel.Optimal, true))
                        {
                            TextWriter TWriter = new StreamWriter(Gzip);
                            CsvWriter  Writer  = new CsvWriter(TWriter);

                            Writer.Configuration.RegisterClassMap<AthenaQueryMetricCsvMapping>();

                            Disposables.Add(Writer);
                            Disposables.Add(TWriter);

                            Writer.WriteHeader<AthenaQueryMetric>();
                            Writer.NextRecord();         // Advance past the header row before writing the records
                            Writer.WriteRecords<AthenaQueryMetric>(Group); // Write only this billing period's records

                            // Make sure to flush all of the data to the stream
                            Writer.Flush();
                            TWriter.Flush();
                        }

                        break;
                    }

                    case "parquet":
                    {
                        Schema PSchema = SchemaReflector.Reflect<AthenaQueryMetric>();

                        //ParquetConvert.Serialize<AthenaQueryMetric>(finishedQueries, MStreamOut, PSchema);


                        break;
                    }
                    }

                    // Make the transfer utility request to post the query data content
                    TransferUtilityUploadRequest Request = new TransferUtilityUploadRequest()
                    {
                        BucketName              = bucket,
                        Key                     = $"data/billingperiod={Group.Key}/{Group.First().QueryExecutionId}_{Group.Last().QueryExecutionId}.csv.gz",
                        InputStream             = MStreamOut,
                        AutoResetStreamPosition = true,
                        AutoCloseStream         = true,
                        ContentType             = "text/csv"
                    };

                    using (TransferUtility XferUtil = new TransferUtility(_S3Client))
                    {
                        try
                        {
                            context.LogInfo($"Starting file upload of {MStreamOut.Length} bytes: {Request.Key}.");
                            // Make the upload
                            await XferUtil.UploadAsync(Request);

                            context.LogInfo($"Finished upload of {Request.Key}.");
                        }
                        catch (Exception e)
                        {
                            string Message = $"Failed to upload data file to s3://{Request.BucketName}/{Request.Key}.";
                            context.LogError(Message, e);
                            await SNSNotify(e, Message, context);
                        }
                    }
                }
                catch (Exception e)
                {
                    context.LogError(e);
                    await SNSNotify(e, context);
                }
                finally
                {
                    // Dispose all of the streams and writers used to
                    // write the CSV content, we need to dispose of these here
                    // so the memory stream doesn't get closed by disposing
                    // of the writers too early, which will cause the transfer utility
                    // to fail the upload
                    foreach (IDisposable Item in Disposables)
                    {
                        try
                        {
                            Item.Dispose();
                        }
                        catch { }
                    }

                    // Make sure memory is cleaned up
                    GC.Collect();
                    GC.WaitForPendingFinalizers();
                }
            }
        }
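
The "parquet" case above reflects a schema but leaves the serialization commented out, and the upload request always uses a .csv.gz key with a text/csv content type. One possible completion of that branch, under the same assumption as the commented-out line (that ParquetConvert.Serialize writes the group's records to the output stream):

                    case "parquet":
                    {
                        Schema PSchema = SchemaReflector.Reflect<AthenaQueryMetric>();
                        ParquetConvert.Serialize<AthenaQueryMetric>(Group, MStreamOut, PSchema);
                        break;
                    }

The upload request would then also need a format-appropriate key suffix (e.g. ".snappy.parquet" instead of ".csv.gz") and content type (e.g. "application/octet-stream").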