/// <summary>
/// Round-trips ten <see cref="SimpleStructure"/> rows through <see cref="ParquetConvert"/>
/// (Snappy compression, row group size 2) and verifies that Id, NullableId, Name and Date
/// of every row come back unchanged.
/// </summary>
public void Serialise_deserialise_all_types()
{
    const int rowCount = 10;
    DateTime now = DateTime.Now;

    // Materialise the query once so the rows handed to the serialiser and the rows used
    // as expected values are the very same instances (the lazy query used to be
    // enumerated twice: once by Serialize and once by ToArray).
    SimpleStructure[] expected = Enumerable
        .Range(0, rowCount)
        .Select(i => new SimpleStructure
        {
            Id = i,
            // Even rows carry no value, odd rows carry their own index.
            NullableId = (i % 2 == 0) ? new int?() : new int?(i),
            Name = $"row {i}",
            Date = now.AddDays(i).RoundToSecond().ToUniversalTime()
        })
        .ToArray();

    using (var ms = new MemoryStream())
    {
        ParquetConvert.Serialize(expected, ms, compressionMethod: CompressionMethod.Snappy, rowGroupSize: 2);

        // Rewind so deserialisation starts from the beginning of the written data.
        ms.Position = 0;
        SimpleStructure[] actual = ParquetConvert.Deserialize<SimpleStructure>(ms);

        // Report a row-count mismatch as an assertion failure rather than an
        // IndexOutOfRangeException inside the loop below.
        Assert.Equal(expected.Length, actual.Length);

        for (int i = 0; i < expected.Length; i++)
        {
            Assert.Equal(expected[i].Id, actual[i].Id);
            Assert.Equal(expected[i].NullableId, actual[i].NullableId);
            Assert.Equal(expected[i].Name, actual[i].Name);
            Assert.Equal(expected[i].Date, actual[i].Date);
        }
    }
}
/// <summary>
/// Serialises ten <see cref="SimpleStructure"/> rows without compression and without an
/// explicit row group size, then asserts that the reader reports exactly one row group
/// and that the columns for the first two schema fields each report a total count of ten.
/// </summary>
public void Serialise_single_row_group()
{
    var rows = new List<SimpleStructure>();
    for (int i = 0; i < 10; i++)
    {
        rows.Add(new SimpleStructure { Id = i, Name = $"row {i}" });
    }

    using (var buffer = new MemoryStream())
    {
        Schema schema = ParquetConvert.Serialize(rows, buffer, compressionMethod: CompressionMethod.None);

        // Rewind before handing the stream to the reader.
        buffer.Position = 0;

        using (var parquetReader = new ParquetReader3(buffer))
        {
            Assert.Equal(1, parquetReader.RowGroupCount);

            using (ParquetRowGroupReader groupReader = parquetReader.OpenRowGroupReader(0))
            {
                // Fields 0 and 1 of the reflected schema; presumably Id and Name — confirm
                // against the SimpleStructure declaration.
                DataColumn firstColumn = groupReader.ReadColumn(schema.DataFieldAt(0));
                DataColumn secondColumn = groupReader.ReadColumn(schema.DataFieldAt(1));

                Assert.Equal(10, firstColumn.TotalCount);
                Assert.Equal(10, secondColumn.TotalCount);
            }
        }
    }
}
/// <summary>
/// Round-trips ten <see cref="SimpleRenamed"/> rows through <see cref="ParquetConvert"/>
/// (Snappy compression, row group size 2) and verifies that Id and PersonName survive.
/// </summary>
/// <remarks>
/// NOTE(review): the column renaming itself is declared on SimpleRenamed, which is not
/// visible here — confirm which property is mapped to a different column name.
/// </remarks>
public void Serialise_deserialise_renamed_column()
{
    const int rowCount = 10;

    // Materialise once so the serialised rows and the expected rows are the same
    // instances (the lazy query used to be enumerated twice).
    SimpleRenamed[] expected = Enumerable
        .Range(0, rowCount)
        .Select(i => new SimpleRenamed
        {
            Id = i,
            PersonName = $"row {i}"
        })
        .ToArray();

    using (var ms = new MemoryStream())
    {
        ParquetConvert.Serialize(expected, ms, compressionMethod: CompressionMethod.Snappy, rowGroupSize: 2);

        // Rewind so deserialisation starts from the beginning of the written data.
        ms.Position = 0;
        SimpleRenamed[] actual = ParquetConvert.Deserialize<SimpleRenamed>(ms);

        // Report a row-count mismatch as an assertion failure rather than an
        // IndexOutOfRangeException inside the loop below.
        Assert.Equal(expected.Length, actual.Length);

        for (int i = 0; i < expected.Length; i++)
        {
            Assert.Equal(expected[i].Id, actual[i].Id);
            Assert.Equal(expected[i].PersonName, actual[i].PersonName);
        }
    }
}
/// <summary>
/// Serialises <paramref name="rows"/> to a local "<c>{name}.parquet</c>" file located under
/// <paramref name="dir"/> inside <c>LocalResultsDir</c>, passes that file to
/// <c>Store.Save</c> under the relative store path built from <paramref name="dir"/> and
/// the file name, and logs the store path.
/// </summary>
/// <typeparam name="T">Row type; must have a parameterless constructor.</typeparam>
/// <param name="rows">Rows to serialise.</param>
/// <param name="name">File name without the ".parquet" extension.</param>
/// <param name="dir">Directory used both locally and as the relative store directory.</param>
async Task SaveParquet<T>(IEnumerable<T> rows, string name, string dir) where T : new()
{
    var remoteDir = StringPath.Relative(dir);

    var localDir = LocalResultsDir.Combine(dir);
    var parquetFile = localDir.Combine($"{name}.parquet");
    ParquetConvert.Serialize(rows, parquetFile.FullPath);

    var remotePath = remoteDir.Add(parquetFile.FileName);
    await Store.Save(remotePath, parquetFile);

    Log.Information("Saved {Path}", remotePath);
}
/// <summary>
/// Serialises <paramref name="instances"/> into an in-memory stream with
/// <see cref="ParquetConvert"/>, rewinds the stream and deserialises it again.
/// </summary>
/// <typeparam name="T">Row type; must have a parameterless constructor.</typeparam>
/// <param name="instances">Instances to round-trip.</param>
/// <returns>The array produced by deserialising the freshly written stream.</returns>
protected T[] ConvertSerialiseDeserialise<T>(IEnumerable<T> instances) where T : new()
{
    using (var buffer = new MemoryStream())
    {
        // The schema returned by Serialize is not needed by this helper.
        ParquetConvert.Serialize<T>(instances, buffer);

        buffer.Position = 0;
        T[] roundTripped = ParquetConvert.Deserialize<T>(buffer);
        return roundTripped;
    }
}
/// <summary>
/// Splits <paramref name="rows"/> with <c>Chunk(200000)</c> and, for each chunk, writes a
/// local "<c>{name}.{index}.parquet</c>" file under <paramref name="dir"/>, passes it to
/// <c>Store.Save</c> under the matching relative store path and logs that path.
/// </summary>
/// <typeparam name="T">Row type; must have a parameterless constructor.</typeparam>
/// <param name="rows">Rows to serialise.</param>
/// <param name="name">Base file name; the chunk index and ".parquet" are appended.</param>
/// <param name="dir">Directory used both locally and as the relative store directory.</param>
async Task SaveParquet<T>(IEnumerable<T> rows, string name, string dir) where T : new()
{
    // Pair every chunk with its zero-based position so each file gets a distinct name.
    var numberedChunks = rows
        .Chunk(200000)
        .Select((part, ordinal) => (part: part, ordinal: ordinal));

    // NOTE(review): the trailing 4 is presumably BlockTransform's degree of parallelism —
    // confirm against its declaration.
    await numberedChunks.BlockTransform(async item =>
    {
        var remoteDir = StringPath.Relative(dir);

        var parquetFile = LocalResultsDir
            .Combine(dir)
            .Combine($"{name}.{item.ordinal}.parquet");
        ParquetConvert.Serialize(item.part, parquetFile.FullPath);

        var remotePath = remoteDir.Add(parquetFile.FileName);
        await Store.Save(remotePath, parquetFile);

        Log.Information("Saved {Path}", remotePath);
        return remoteDir;
    }, 4);
}
/// <summary>
/// Round-trips ten <see cref="StructureWithIgnoredProperties"/> rows (Snappy compression,
/// row group size 2) and verifies that Id and Name survive while the remaining populated
/// properties come back as the default value of their type.
/// </summary>
/// <remarks>
/// NOTE(review): which properties are marked as ignored is declared on
/// StructureWithIgnoredProperties, which is not visible here — confirm against that type.
/// </remarks>
public void Serialise_Should_Exclude_IgnoredProperties_while_serialized_to_parquetfile()
{
    const int rowCount = 10;

    // Materialise once: the initialiser reads DateTime.Now, so enumerating the lazy query
    // twice (Serialize + ToArray) used to build different rows for writing and comparing.
    StructureWithIgnoredProperties[] expected = Enumerable
        .Range(0, rowCount)
        .Select(i => new StructureWithIgnoredProperties
        {
            Id = i,
            Name = $"row {i}",
            SSN = "000-00-0000",
            NonNullableDecimal = 100.534M,
            NullableDecimal = 99.99M,
            NonNullableDateTime = DateTime.Now,
            NullableDateTime = DateTime.Now,
            NullableInt = 111,
            NonNullableInt = 222
        })
        .ToArray();

    using (var ms = new MemoryStream())
    {
        ParquetConvert.Serialize(expected, ms, compressionMethod: CompressionMethod.Snappy, rowGroupSize: 2);

        // Rewind so deserialisation starts from the beginning of the written data.
        ms.Position = 0;
        StructureWithIgnoredProperties[] actual = ParquetConvert.Deserialize<StructureWithIgnoredProperties>(ms);

        // default(T) computed at run time: a fresh instance for value types, null otherwise
        // (Activator.CreateInstance yields null for Nullable<T>).
        Func<Type, object> defaultOf = type => type.IsValueType ? Activator.CreateInstance(type) : null;

        // Report a row-count mismatch as an assertion failure rather than an
        // IndexOutOfRangeException inside the loop below.
        Assert.Equal(expected.Length, actual.Length);

        for (int i = 0; i < expected.Length; i++)
        {
            Assert.Equal(expected[i].Id, actual[i].Id);
            Assert.Equal(expected[i].Name, actual[i].Name);

            // Serialisation ignored the properties below, so deserialising them should
            // always yield null (or the type's default value).
            // Arguments are in (expected, actual) order so failure messages read correctly.
            Assert.Equal(defaultOf(typeof(string)), actual[i].SSN);
            Assert.Equal(defaultOf(typeof(int)), actual[i].NonNullableInt);
            Assert.Equal(defaultOf(typeof(int?)), actual[i].NullableInt);
            Assert.Equal(defaultOf(typeof(decimal)), actual[i].NonNullableDecimal);
            Assert.Equal(defaultOf(typeof(decimal?)), actual[i].NullableDecimal);
            Assert.Equal(defaultOf(typeof(DateTime)), actual[i].NonNullableDateTime);
            Assert.Equal(defaultOf(typeof(DateTime?)), actual[i].NullableDateTime);
        }
    }
}
/// <summary>
/// Round-trips two <c>Installation</c> instances through <see cref="ParquetConvert"/> and
/// asserts the result is equivalent to the input, in the same order, ignoring Id.
/// </summary>
public void SimpleParquetSerializationWorks()
{
    var t = DateTimeOffset.UtcNow.TruncateTo(TimeSpan.FromSeconds(1));

    var expected = new Installation[2];
    for (int n = 0; n < expected.Length; n++)
    {
        expected[n] = new Installation(Guid.NewGuid(), $"I{n}", t, t.Add(TimeSpan.FromHours(1)));
    }

    using (var buffer = new MemoryStream())
    {
        ParquetConvert.Serialize(expected, buffer);

        // Rewind before reading back what was just written.
        buffer.Position = 0;
        var actual = ParquetConvert.Deserialize<Installation>(buffer);

        actual.Should().BeEquivalentTo(
            expected,
            cfg => cfg
                .WithStrictOrdering()
                // Currently Guid is not serialized :(
                .Excluding(x => x.Id));
    }
}
/// <summary>
/// Wraps <paramref name="value"/> in a <c>StructureWithTestType&lt;T&gt;</c> with Id "1",
/// serialises it using the schema reflected from that type, deserialises it again and
/// asserts that exactly one row comes back carrying the same Id and value.
/// </summary>
/// <typeparam name="T">Type of the value under test.</typeparam>
/// <param name="value">Value expected to survive the round trip.</param>
void TestRoundTripSerialization<T>(T value)
{
    var original = new StructureWithTestType<T>
    {
        Id = "1",
        TestValue = value,
    };
    var rows = new StructureWithTestType<T>[] { original };

    Schema reflectedSchema = SchemaReflector.Reflect<StructureWithTestType<T>>();

    using (var buffer = new MemoryStream())
    {
        ParquetConvert.Serialize<StructureWithTestType<T>>(rows, buffer, reflectedSchema);

        // Rewind before reading back what was just written.
        buffer.Position = 0;
        StructureWithTestType<T>[] roundTripped = ParquetConvert.Deserialize<StructureWithTestType<T>>(buffer);

        Assert.Single(roundTripped);
        Assert.Equal("1", roundTripped[0].Id);
        Assert.Equal(value, roundTripped[0].TestValue);
    }
}
/// <summary>
/// Serialises ten <see cref="SimpleStructure"/> rows with a row group size of 2, then
/// deserialises each of the five resulting row groups individually and verifies its two
/// rows. Finally checks that asking for a row group index past the end throws
/// <see cref="ArgumentOutOfRangeException"/> for parameter "index".
/// </summary>
public void Serialise_read_and_deserialise_by_rowgroup()
{
    const int rowCount = 10;
    const int rowGroupSize = 2;
    // Derived from the test input: 10 records with row group size 2 give 5 row groups.
    const int rowGroupCount = rowCount / rowGroupSize;

    DateTime now = DateTime.Now;

    // Materialise once so the serialised rows and the expected rows are the same
    // instances (the lazy query used to be enumerated twice).
    SimpleStructure[] expected = Enumerable
        .Range(0, rowCount)
        .Select(i => new SimpleStructure
        {
            Id = i,
            // Even rows carry no value, odd rows carry their own index.
            NullableId = (i % 2 == 0) ? new int?() : new int?(i),
            Name = $"row {i}",
            Date = now.AddDays(i).RoundToSecond().ToUniversalTime()
        })
        .ToArray();

    using (var ms = new MemoryStream())
    {
        ParquetConvert.Serialize(expected, ms, compressionMethod: CompressionMethod.Snappy, rowGroupSize: rowGroupSize);

        ms.Position = 0;

        for (int r = 0; r < rowGroupCount; r++)
        {
            SimpleStructure[] rowGroupRecords = ParquetConvert.Deserialize<SimpleStructure>(ms, rowGroupIndex: r);
            Assert.Equal(rowGroupSize, rowGroupRecords.Length);

            // Row j of group r corresponds to source row (rowGroupSize * r + j).
            for (int j = 0; j < rowGroupSize; j++)
            {
                SimpleStructure want = expected[rowGroupSize * r + j];
                SimpleStructure got = rowGroupRecords[j];

                Assert.Equal(want.Id, got.Id);
                Assert.Equal(want.NullableId, got.NullableId);
                Assert.Equal(want.Name, got.Name);
                Assert.Equal(want.Date, got.Date);
            }
        }

        // The first invalid index and a far out-of-range index must both be rejected.
        Assert.Throws<ArgumentOutOfRangeException>("index", () => ParquetConvert.Deserialize<SimpleStructure>(ms, rowGroupCount));
        Assert.Throws<ArgumentOutOfRangeException>("index", () => ParquetConvert.Deserialize<SimpleStructure>(ms, 99999));
    }
}