Example #1
0
        /// <summary>
        /// Round-trips ten <c>SimpleStructure</c> rows through <c>ParquetConvert.Serialize</c> and
        /// <c>ParquetConvert.Deserialize</c> (Snappy compression, row group size 2) and checks that
        /// <c>Id</c>, <c>NullableId</c>, <c>Name</c> and <c>Date</c> come back unchanged.
        /// </summary>
        public void Serialise_deserialise_all_types()
        {
            DateTime now = DateTime.Now;

            // Materialise once so the very same instances are serialised and used as the
            // expected values, instead of re-running the deferred query for each use.
            SimpleStructure[] structuresArray = Enumerable
                .Range(0, 10)
                .Select(i => new SimpleStructure
                {
                    Id         = i,
                    NullableId = (i % 2 == 0) ? new int?() : new int?(i),
                    Name       = $"row {i}",
                    Date       = now.AddDays(i).RoundToSecond().ToUniversalTime()
                })
                .ToArray();

            using (var ms = new MemoryStream())
            {
                ParquetConvert.Serialize(structuresArray, ms, compressionMethod: CompressionMethod.Snappy, rowGroupSize: 2);

                // Rewind so deserialisation starts at the beginning of the written data.
                ms.Position = 0;

                SimpleStructure[] structures2 = ParquetConvert.Deserialize<SimpleStructure>(ms);

                // Catch missing or extra rows before comparing element by element.
                Assert.Equal(structuresArray.Length, structures2.Length);

                for (int i = 0; i < structuresArray.Length; i++)
                {
                    Assert.Equal(structuresArray[i].Id, structures2[i].Id);
                    Assert.Equal(structuresArray[i].NullableId, structures2[i].NullableId);
                    Assert.Equal(structuresArray[i].Name, structures2[i].Name);
                    Assert.Equal(structuresArray[i].Date, structures2[i].Date);
                }
            }
        }
Example #2
0
        /// <summary>
        /// Serialises ten rows without compression and without an explicit row group size, then
        /// asserts that the reader reports exactly one row group and that the first two schema
        /// columns each report a <c>TotalCount</c> of 10.
        /// </summary>
        public void Serialise_single_row_group()
        {
            var rows = new List<SimpleStructure>();
            for (int n = 0; n < 10; n++)
            {
                rows.Add(new SimpleStructure { Id = n, Name = $"row {n}" });
            }

            using (MemoryStream buffer = new MemoryStream())
            {
                Schema fileSchema = ParquetConvert.Serialize(rows, buffer, compressionMethod: CompressionMethod.None);

                // Rewind so the reader starts at the beginning of the written data.
                buffer.Seek(0, SeekOrigin.Begin);

                using (ParquetReader3 parquet = new ParquetReader3(buffer))
                {
                    Assert.Equal(1, parquet.RowGroupCount);

                    using (ParquetRowGroupReader rowGroup = parquet.OpenRowGroupReader(0))
                    {
                        DataColumn idColumn   = rowGroup.ReadColumn(fileSchema.DataFieldAt(0));
                        DataColumn nameColumn = rowGroup.ReadColumn(fileSchema.DataFieldAt(1));

                        Assert.Equal(10, idColumn.TotalCount);
                        Assert.Equal(10, nameColumn.TotalCount);
                    }
                }
            }
        }
Example #3
0
        /// <summary>
        /// Round-trips ten <c>SimpleRenamed</c> rows through <c>ParquetConvert.Serialize</c> and
        /// <c>ParquetConvert.Deserialize</c> (Snappy compression, row group size 2) and checks that
        /// <c>Id</c> and <c>PersonName</c> come back unchanged.
        /// </summary>
        public void Serialise_deserialise_renamed_column()
        {
            // Materialise once so the very same instances are serialised and used as the
            // expected values, instead of re-running the deferred query for each use.
            SimpleRenamed[] structuresArray = Enumerable
                .Range(0, 10)
                .Select(i => new SimpleRenamed
                {
                    Id         = i,
                    PersonName = $"row {i}"
                })
                .ToArray();

            using (var ms = new MemoryStream())
            {
                ParquetConvert.Serialize(structuresArray, ms, compressionMethod: CompressionMethod.Snappy, rowGroupSize: 2);

                // Rewind so deserialisation starts at the beginning of the written data.
                ms.Position = 0;

                SimpleRenamed[] structures2 = ParquetConvert.Deserialize<SimpleRenamed>(ms);

                // Catch missing or extra rows before comparing element by element.
                Assert.Equal(structuresArray.Length, structures2.Length);

                for (int i = 0; i < structuresArray.Length; i++)
                {
                    Assert.Equal(structuresArray[i].Id, structures2[i].Id);
                    Assert.Equal(structuresArray[i].PersonName, structures2[i].PersonName);
                }
            }
        }
        /// <summary>
        /// Serialises <paramref name="rows"/> to a local <c>{name}.parquet</c> file located under
        /// <paramref name="dir"/> inside <c>LocalResultsDir</c>, passes that file to
        /// <c>Store.Save</c> under the matching relative path, and logs the stored path.
        /// </summary>
        /// <param name="rows">Rows handed to <c>ParquetConvert.Serialize</c>.</param>
        /// <param name="name">File name without the <c>.parquet</c> extension.</param>
        /// <param name="dir">Directory used both locally and as the relative store directory.</param>
        async Task SaveParquet<T>(IEnumerable<T> rows, string name, string dir) where T : new()
        {
            var remoteDir = StringPath.Relative(dir);

            string fileName = $"{name}.parquet";
            var parquetFile = LocalResultsDir.Combine(dir).Combine(fileName);
            ParquetConvert.Serialize(rows, parquetFile.FullPath);

            // The store path reuses the local file's name beneath the relative directory.
            var remotePath = remoteDir.Add(parquetFile.FileName);
            await Store.Save(remotePath, parquetFile);

            Log.Information("Saved {Path}", remotePath);
        }
Example #5
0
        /// <summary>
        /// Writes <paramref name="instances"/> to an in-memory stream with
        /// <c>ParquetConvert.Serialize</c>, rewinds the stream, and returns whatever
        /// <c>ParquetConvert.Deserialize</c> reads back from it.
        /// </summary>
        /// <param name="instances">Objects to round-trip.</param>
        /// <returns>The array produced by deserialising the freshly written data.</returns>
        protected T[] ConvertSerialiseDeserialise<T>(IEnumerable<T> instances) where T : new()
        {
            using (MemoryStream buffer = new MemoryStream())
            {
                // The schema returned by Serialize is not needed for the round trip.
                ParquetConvert.Serialize<T>(instances, buffer);

                buffer.Seek(0, SeekOrigin.Begin);

                T[] roundTripped = ParquetConvert.Deserialize<T>(buffer);
                return roundTripped;
            }
        }
Example #6
0
 /// <summary>
 /// Splits <paramref name="rows"/> into chunks of 200000, serialises each chunk to its own local
 /// <c>{name}.{index}.parquet</c> file under <paramref name="dir"/>, passes each file to
 /// <c>Store.Save</c>, and logs every stored path.
 /// </summary>
 /// <param name="rows">Rows to split and serialise.</param>
 /// <param name="name">Base file name; the zero-based chunk index is inserted before the extension.</param>
 /// <param name="dir">Directory used both locally and as the relative store directory.</param>
 async Task SaveParquet<T>(IEnumerable<T> rows, string name, string dir) where T : new()
 {
     // Pair every chunk with its position so each output file gets a distinct name.
     var numberedChunks = rows
         .Chunk(200000)
         .Select((r, i) => (batch: r, ordinal: i));

     // NOTE(review): the trailing 4 is presumably the degree of parallelism for BlockTransform — confirm.
     await numberedChunks.BlockTransform(async chunk =>
     {
         var remoteDir   = StringPath.Relative(dir);
         var parquetFile = LocalResultsDir.Combine(dir).Combine($"{name}.{chunk.ordinal}.parquet");

         ParquetConvert.Serialize(chunk.batch, parquetFile.FullPath);

         var remotePath = remoteDir.Add(parquetFile.FileName);
         await Store.Save(remotePath, parquetFile);

         Log.Information("Saved {Path}", remotePath);
         return(remoteDir);
     }, 4);
 }
Example #7
0
        /// <summary>
        /// Round-trips ten <c>StructureWithIgnoredProperties</c> rows through <c>ParquetConvert</c>
        /// and checks that <c>Id</c> and <c>Name</c> survive while every other populated property
        /// comes back as its type's default value, i.e. was left out of the serialised data.
        /// </summary>
        public void Serialise_Should_Exclude_IgnoredProperties_while_serialized_to_parquetfile()
        {
            // Materialise once: the initialiser calls DateTime.Now, so re-enumerating the deferred
            // query would produce different instances from the ones that were serialised.
            StructureWithIgnoredProperties[] structuresArray = Enumerable
                .Range(0, 10)
                .Select(i => new StructureWithIgnoredProperties
                {
                    Id   = i,
                    Name = $"row {i}",
                    SSN  = "000-00-0000",
                    NonNullableDecimal  = 100.534M,
                    NullableDecimal     = 99.99M,
                    NonNullableDateTime = DateTime.Now,
                    NullableDateTime    = DateTime.Now,
                    NullableInt         = 111,
                    NonNullableInt      = 222
                })
                .ToArray();

            using (var ms = new MemoryStream())
            {
                ParquetConvert.Serialize(structuresArray, ms, compressionMethod: CompressionMethod.Snappy, rowGroupSize: 2);

                // Rewind so deserialisation starts at the beginning of the written data.
                ms.Position = 0;

                StructureWithIgnoredProperties[] structures2 = ParquetConvert.Deserialize<StructureWithIgnoredProperties>(ms);

                // null for reference types (and Nullable<T>), a zero-initialised instance for value types.
                Func<Type, object> defaultValueOf = type => type.IsValueType ? Activator.CreateInstance(type) : null;

                // Catch missing or extra rows before comparing element by element.
                Assert.Equal(structuresArray.Length, structures2.Length);

                for (int i = 0; i < structuresArray.Length; i++)
                {
                    Assert.Equal(structuresArray[i].Id, structures2[i].Id);
                    Assert.Equal(structuresArray[i].Name, structures2[i].Name);

                    // Serialisation skipped the properties below, so after deserialising they must
                    // hold their type's default value. Expected value goes first, per xUnit's
                    // Assert.Equal(expected, actual) contract, so failure messages read correctly.
                    Assert.Equal(defaultValueOf(typeof(string)), structures2[i].SSN);
                    Assert.Equal(defaultValueOf(typeof(int)), structures2[i].NonNullableInt);
                    Assert.Equal(defaultValueOf(typeof(int?)), structures2[i].NullableInt);
                    Assert.Equal(defaultValueOf(typeof(decimal)), structures2[i].NonNullableDecimal);
                    Assert.Equal(defaultValueOf(typeof(decimal?)), structures2[i].NullableDecimal);
                    Assert.Equal(defaultValueOf(typeof(DateTime)), structures2[i].NonNullableDateTime);
                    Assert.Equal(defaultValueOf(typeof(DateTime?)), structures2[i].NullableDateTime);
                }
            }
        }
        /// <summary>
        /// Serialises two <c>Installation</c> values to an in-memory stream, deserialises them
        /// again, and asserts the result is equivalent to the input in the same order,
        /// ignoring <c>Id</c>.
        /// </summary>
        public void SimpleParquetSerializationWorks()
        {
            var start = DateTimeOffset.UtcNow.TruncateTo(TimeSpan.FromSeconds(1));

            var expected = Enumerable
                .Range(0, 2)
                .Select(n => new Installation(Guid.NewGuid(), $"I{n}", start, start.Add(TimeSpan.FromHours(1))))
                .ToArray();

            using (var buffer = new MemoryStream())
            {
                ParquetConvert.Serialize(expected, buffer);
                buffer.Position = 0;

                var actual = ParquetConvert.Deserialize<Installation>(buffer);

                actual.Should().BeEquivalentTo(
                    expected,
                    options => options
                        .WithStrictOrdering()
                        // Id is left out of the comparison because Guid is currently not serialized.
                        .Excluding(x => x.Id));
            }
        }
Example #9
0
        /// <summary>
        /// Wraps <paramref name="value"/> in a <c>StructureWithTestType&lt;T&gt;</c> with
        /// <c>Id</c> "1", serialises it using the schema reflected for that type, deserialises it
        /// again, and asserts that exactly one record comes back with the same id and test value.
        /// </summary>
        /// <param name="value">Value stored in <c>TestValue</c> and expected after the round trip.</param>
        void TestRoundTripSerialization<T>(T value)
        {
            var original = new StructureWithTestType<T>
            {
                Id        = "1",
                TestValue = value,
            };
            StructureWithTestType<T>[] records = { original };

            Schema reflected = SchemaReflector.Reflect<StructureWithTestType<T>>();

            using (var buffer = new MemoryStream())
            {
                ParquetConvert.Serialize<StructureWithTestType<T>>(records, buffer, reflected);

                // Rewind so deserialisation starts at the beginning of the written data.
                buffer.Seek(0, SeekOrigin.Begin);

                StructureWithTestType<T>[] roundTripped = ParquetConvert.Deserialize<StructureWithTestType<T>>(buffer);

                Assert.Single(roundTripped);
                Assert.Equal("1", roundTripped[0].Id);
                Assert.Equal(value, roundTripped[0].TestValue);
            }
        }
Example #10
0
        /// <summary>
        /// Serialises ten <c>SimpleStructure</c> rows with a row group size of 2, then deserialises
        /// each row group individually and checks that it holds the expected pair of rows.
        /// Finally verifies that an out-of-range row group index raises
        /// <see cref="ArgumentOutOfRangeException"/> for parameter "index".
        /// </summary>
        public void Serialise_read_and_deserialise_by_rowgroup()
        {
            // Test dimensions live in one place so the expected group count always matches the input.
            const int rowCount      = 10;
            const int rowGroupSize  = 2;
            const int rowGroupCount = rowCount / rowGroupSize;

            DateTime now = DateTime.Now;

            // Materialise once so the very same instances are serialised and used as the
            // expected values, instead of re-running the deferred query for each use.
            SimpleStructure[] structuresArray = Enumerable
                .Range(0, rowCount)
                .Select(i => new SimpleStructure
                {
                    Id         = i,
                    NullableId = (i % 2 == 0) ? new int?() : new int?(i),
                    Name       = $"row {i}",
                    Date       = now.AddDays(i).RoundToSecond().ToUniversalTime()
                })
                .ToArray();

            using (var ms = new MemoryStream())
            {
                ParquetConvert.Serialize(structuresArray, ms, compressionMethod: CompressionMethod.Snappy, rowGroupSize: rowGroupSize);

                ms.Position = 0;

                for (int r = 0; r < rowGroupCount; r++)
                {
                    SimpleStructure[] rowGroupRecords = ParquetConvert.Deserialize<SimpleStructure>(ms, rowGroupIndex: r);
                    Assert.Equal(rowGroupSize, rowGroupRecords.Length);

                    // Row k of group r corresponds to source row (rowGroupSize * r + k).
                    for (int k = 0; k < rowGroupSize; k++)
                    {
                        int sourceIndex = rowGroupSize * r + k;

                        Assert.Equal(structuresArray[sourceIndex].Id, rowGroupRecords[k].Id);
                        Assert.Equal(structuresArray[sourceIndex].NullableId, rowGroupRecords[k].NullableId);
                        Assert.Equal(structuresArray[sourceIndex].Name, rowGroupRecords[k].Name);
                        Assert.Equal(structuresArray[sourceIndex].Date, rowGroupRecords[k].Date);
                    }
                }

                // The first index past the last row group, and one far beyond it, must both be rejected.
                Assert.Throws<ArgumentOutOfRangeException>("index", () => ParquetConvert.Deserialize<SimpleStructure>(ms, rowGroupCount));
                Assert.Throws<ArgumentOutOfRangeException>("index", () => ParquetConvert.Deserialize<SimpleStructure>(ms, 99999));
            }
        }