/// <summary>
/// Round-trips ten <see cref="SimpleStructure"/> rows through <see cref="ParquetConvert"/>
/// (Snappy compression, row group size 2) and checks that every row comes back with the
/// same Id, NullableId, Name and Date.
/// </summary>
public void Serialise_deserialise_all_types()
{
    DateTime now = DateTime.Now;

    // Materialise the test data once. The LINQ query is deferred, so handing it to both
    // Serialize and ToArray would build the objects twice.
    SimpleStructure[] structuresArray = Enumerable
        .Range(0, 10)
        .Select(i => new SimpleStructure
        {
            Id = i,
            // Even rows carry no value, odd rows carry their own index.
            NullableId = (i % 2 == 0) ? new int?() : new int?(i),
            Name = $"row {i}",
            Date = now.AddDays(i).RoundToSecond().ToUniversalTime()
        })
        .ToArray();

    using (var ms = new MemoryStream())
    {
        ParquetConvert.Serialize(structuresArray, ms, compressionMethod: CompressionMethod.Snappy, rowGroupSize: 2);

        // Rewind so that deserialization starts from the beginning of what was just written.
        ms.Position = 0;

        SimpleStructure[] structures2 = ParquetConvert.Deserialize<SimpleStructure>(ms);

        // Guard against missing or surplus rows before comparing element by element.
        Assert.Equal(structuresArray.Length, structures2.Length);

        for (int i = 0; i < structuresArray.Length; i++)
        {
            Assert.Equal(structuresArray[i].Id, structures2[i].Id);
            Assert.Equal(structuresArray[i].NullableId, structures2[i].NullableId);
            Assert.Equal(structuresArray[i].Name, structures2[i].Name);
            Assert.Equal(structuresArray[i].Date, structures2[i].Date);
        }
    }
}
/// <summary>
/// Round-trips ten <see cref="SimpleRenamed"/> rows through <see cref="ParquetConvert"/>
/// (Snappy compression, row group size 2) and checks that Id and PersonName survive.
/// </summary>
public void Serialise_deserialise_renamed_column()
{
    // Materialise the test data once. The LINQ query is deferred, so handing it to both
    // Serialize and ToArray would build the objects twice.
    SimpleRenamed[] structuresArray = Enumerable
        .Range(0, 10)
        .Select(i => new SimpleRenamed
        {
            Id = i,
            PersonName = $"row {i}"
        })
        .ToArray();

    using (var ms = new MemoryStream())
    {
        ParquetConvert.Serialize(structuresArray, ms, compressionMethod: CompressionMethod.Snappy, rowGroupSize: 2);

        // Rewind so that deserialization starts from the beginning of what was just written.
        ms.Position = 0;

        SimpleRenamed[] structures2 = ParquetConvert.Deserialize<SimpleRenamed>(ms);

        // Guard against missing or surplus rows before comparing element by element.
        Assert.Equal(structuresArray.Length, structures2.Length);

        for (int i = 0; i < structuresArray.Length; i++)
        {
            Assert.Equal(structuresArray[i].Id, structures2[i].Id);
            Assert.Equal(structuresArray[i].PersonName, structures2[i].PersonName);
        }
    }
}
/// <summary>
/// Deserializes the Parquet content of <paramref name="stream"/> into a list of
/// <typeparamref name="T"/>. The stream is first passed through <c>AsSeekableAsync</c>
/// and the synchronous <see cref="ParquetConvert"/> call is run via <c>Task.Run</c>.
/// </summary>
/// <typeparam name="T">Row type; must expose a public parameterless constructor.</typeparam>
/// <param name="stream">Source stream holding the Parquet data.</param>
/// <param name="cancellationToken">Forwarded to <c>AsSeekableAsync</c> and to <c>Task.Run</c>.</param>
/// <param name="bufferSize">Optional buffer size forwarded to <c>AsSeekableAsync</c>.</param>
/// <returns>The deserialized rows.</returns>
public static async Task<IList<T>> ParquetDeserializeAsync<T>(this Stream stream, CancellationToken cancellationToken, int? bufferSize = null)
    where T : new()
{
    // NOTE(review): AsSeekableAsync is defined elsewhere; presumably it yields a seekable
    // (possibly buffered) view over the input - confirm against its implementation.
    using (var seekable = await stream.AsSeekableAsync(cancellationToken, bufferSize).ConfigureAwait(false))
    {
        // The lambda captures a disposable, but it is awaited before the using block ends.
        // ReSharper disable once AccessToDisposedClosure -- use is awaited
        IList<T> rows = await Task.Run(() => ParquetConvert.Deserialize<T>(seekable), cancellationToken).ConfigureAwait(false);
        return rows;
    }
}
/// <summary>
/// Writes <paramref name="instances"/> to an in-memory Parquet stream and immediately
/// reads them back, returning the deserialized copies.
/// </summary>
/// <typeparam name="T">Row type; must expose a public parameterless constructor.</typeparam>
/// <param name="instances">Objects to round-trip.</param>
/// <returns>The objects produced by deserializing the serialized data.</returns>
protected T[] ConvertSerialiseDeserialise<T>(IEnumerable<T> instances) where T : new()
{
    using (var buffer = new MemoryStream())
    {
        // The schema returned by Serialize is not needed here.
        ParquetConvert.Serialize<T>(instances, buffer);

        // Rewind to the start so the reader sees what was just written.
        buffer.Seek(0, SeekOrigin.Begin);

        T[] roundTripped = ParquetConvert.Deserialize<T>(buffer);
        return roundTripped;
    }
}
/// <summary>
/// Round-trips ten <see cref="StructureWithIgnoredProperties"/> rows through
/// <see cref="ParquetConvert"/> and checks that Id and Name survive while the properties
/// that serialization is expected to ignore come back as their type's default value.
/// </summary>
public void Serialise_Should_Exclude_IgnoredProperties_while_serialized_to_parquetfile()
{
    // One timestamp for the whole data set keeps the input deterministic.
    DateTime now = DateTime.Now;

    // Materialise the test data once. The LINQ query is deferred, so handing it to both
    // Serialize and ToArray would build the objects twice.
    StructureWithIgnoredProperties[] structuresArray = Enumerable
        .Range(0, 10)
        .Select(i => new StructureWithIgnoredProperties
        {
            Id = i,
            Name = $"row {i}",
            SSN = "000-00-0000",
            NonNullableDecimal = 100.534M,
            NullableDecimal = 99.99M,
            NonNullableDateTime = now,
            NullableDateTime = now,
            NullableInt = 111,
            NonNullableInt = 222
        })
        .ToArray();

    using (var ms = new MemoryStream())
    {
        ParquetConvert.Serialize(structuresArray, ms, compressionMethod: CompressionMethod.Snappy, rowGroupSize: 2);

        // Rewind so that deserialization starts from the beginning of what was just written.
        ms.Position = 0;

        StructureWithIgnoredProperties[] structures2 = ParquetConvert.Deserialize<StructureWithIgnoredProperties>(ms);

        // Guard against missing or surplus rows before comparing element by element.
        Assert.Equal(structuresArray.Length, structures2.Length);

        for (int i = 0; i < structuresArray.Length; i++)
        {
            Assert.Equal(structuresArray[i].Id, structures2[i].Id);
            Assert.Equal(structuresArray[i].Name, structures2[i].Name);

            // As serialization ignored the properties below, deserializing them should always
            // yield null (or the type's default value). The expected value goes first so that
            // failure messages label expected/actual correctly.
            Assert.Null(structures2[i].SSN);
            Assert.Equal(default(int), structures2[i].NonNullableInt);
            Assert.Null(structures2[i].NullableInt);
            Assert.Equal(default(decimal), structures2[i].NonNullableDecimal);
            Assert.Null(structures2[i].NullableDecimal);
            Assert.Equal(default(DateTime), structures2[i].NonNullableDateTime);
            Assert.Null(structures2[i].NullableDateTime);
        }
    }
}
/// <summary>
/// Serializes two <see cref="Installation"/> instances to an in-memory Parquet stream,
/// reads them back and checks that the result is equivalent to the input, in the same
/// order, ignoring the Id member.
/// </summary>
public void SimpleParquetSerializationWorks()
{
    // The timestamp is passed through TruncateTo with a one-second argument before use.
    var timestamp = DateTimeOffset.UtcNow.TruncateTo(TimeSpan.FromSeconds(1));
    var oneHourLater = timestamp.Add(TimeSpan.FromHours(1));

    var expected = Enumerable
        .Range(0, 2)
        .Select(index => new Installation(Guid.NewGuid(), $"I{index}", timestamp, oneHourLater))
        .ToArray();

    using (var buffer = new MemoryStream())
    {
        ParquetConvert.Serialize(expected, buffer);

        // Rewind to the start so the reader sees what was just written.
        buffer.Position = 0;

        var actual = ParquetConvert.Deserialize<Installation>(buffer);

        actual.Should().BeEquivalentTo(
            expected,
            options => options
                .WithStrictOrdering()
                // Currently Guid is not serialized :(
                .Excluding(installation => installation.Id));
    }
}
/// <summary>
/// Wraps <paramref name="value"/> in a <see cref="StructureWithTestType{T}"/>, serializes
/// it with a reflected schema, deserializes it again and asserts that exactly one row
/// comes back carrying the same Id and TestValue.
/// </summary>
/// <typeparam name="T">Type of the value under test.</typeparam>
/// <param name="value">Value expected to survive the round trip unchanged.</param>
void TestRoundTripSerialization<T>(T value)
{
    var input = new StructureWithTestType<T>();
    input.Id = "1";
    input.TestValue = value;

    // The schema is reflected from the wrapper type and passed to Serialize explicitly.
    Schema schema = SchemaReflector.Reflect<StructureWithTestType<T>>();

    var rows = new[] { input };

    using (var stream = new MemoryStream())
    {
        ParquetConvert.Serialize<StructureWithTestType<T>>(rows, stream, schema);

        // Rewind to the start so the reader sees what was just written.
        stream.Seek(0, SeekOrigin.Begin);

        StructureWithTestType<T>[] output = ParquetConvert.Deserialize<StructureWithTestType<T>>(stream);

        Assert.Single(output);

        StructureWithTestType<T> only = output[0];
        Assert.Equal("1", only.Id);
        Assert.Equal(value, only.TestValue);
    }
}
/// <summary>
/// Writes ten <see cref="SimpleStructure"/> rows with a row group size of 2, then reads
/// each row group individually and checks its rows against the matching slice of the
/// input. Finally verifies that asking for a row group index past the end throws
/// <see cref="ArgumentOutOfRangeException"/> for parameter "index".
/// </summary>
public void Serialise_read_and_deserialise_by_rowgroup()
{
    const int rowGroupSize = 2;
    DateTime now = DateTime.Now;

    // Materialise the test data once. The LINQ query is deferred, so handing it to both
    // Serialize and ToArray would build the objects twice.
    SimpleStructure[] structuresArray = Enumerable
        .Range(0, 10)
        .Select(i => new SimpleStructure
        {
            Id = i,
            // Even rows carry no value, odd rows carry their own index.
            NullableId = (i % 2 == 0) ? new int?() : new int?(i),
            Name = $"row {i}",
            Date = now.AddDays(i).RoundToSecond().ToUniversalTime()
        })
        .ToArray();

    // Based on our test input: 10 records with row group size 2 gives 5 row groups.
    int rowGroupCount = structuresArray.Length / rowGroupSize;

    using (var ms = new MemoryStream())
    {
        ParquetConvert.Serialize(structuresArray, ms, compressionMethod: CompressionMethod.Snappy, rowGroupSize: rowGroupSize);
        ms.Position = 0;

        for (int r = 0; r < rowGroupCount; r++)
        {
            SimpleStructure[] rowGroupRecords = ParquetConvert.Deserialize<SimpleStructure>(ms, rowGroupIndex: r);
            Assert.Equal(rowGroupSize, rowGroupRecords.Length);

            // Row j of group r corresponds to input row (rowGroupSize * r + j).
            for (int j = 0; j < rowGroupSize; j++)
            {
                SimpleStructure expected = structuresArray[rowGroupSize * r + j];
                SimpleStructure actual = rowGroupRecords[j];

                Assert.Equal(expected.Id, actual.Id);
                Assert.Equal(expected.NullableId, actual.NullableId);
                Assert.Equal(expected.Name, actual.Name);
                Assert.Equal(expected.Date, actual.Date);
            }
        }

        // The first index past the last row group, and one far beyond it, must both be rejected.
        Assert.Throws<ArgumentOutOfRangeException>("index", () => ParquetConvert.Deserialize<SimpleStructure>(ms, rowGroupCount));
        Assert.Throws<ArgumentOutOfRangeException>("index", () => ParquetConvert.Deserialize<SimpleStructure>(ms, 99999));
    }
}
/// <summary>
/// Provides an IEnumerable interface to the Parquet.Net deserialization of data for efficient
/// processing, with support for LINQ processing if/when needed.
/// WARNING: Care must be taken to avoid unnecessary multiple enumerations, e.g. by being sure
/// to project filtered results into a List (e.g. ToList())!
/// </summary>
/// <remarks>
/// This is an iterator method: nothing in the body, including the open-reader check, runs
/// until the returned sequence is enumerated, and every enumeration reads the row groups again.
/// </remarks>
/// <typeparam name="T">Row type to deserialize into; must expose a public parameterless constructor.</typeparam>
/// <returns>A lazily evaluated sequence of items, deserialized one row group at a time.</returns>
public IEnumerable<T> Read<T>() where T : new()
{
    AssertParquetReaderIsOpen();

    // It seems that the only API that works consistently is this one, whereby we loop over the
    // row groups ourselves. This also provides a little more control over the IEnumerable
    // processing by not forcing all rows and row groups to be in memory at one time.
    int rowGroupIndex = 0;
    while (rowGroupIndex < _parquetReader.RowGroupCount)
    {
        LogDebug($"Enumerating over RowGroup #[{rowGroupIndex}]...");

        // Time only the deserialization of the row group, not the consumer's processing.
        var stopwatch = Stopwatch.StartNew();
        var rowGroupItems = ParquetConvert.Deserialize<T>(_blobStream, rowGroupIndex);
        stopwatch.Stop();

        LogDebug($"Deserialized RowGroup [{rowGroupIndex}] from the Stream in [{stopwatch.ToElapsedTimeDescriptiveFormat()}].");

        foreach (var item in rowGroupItems)
        {
            yield return item;
        }

        rowGroupIndex++;
    }
}
/// <summary>
/// Opens a local Parquet file and deserializes its content into <see cref="VideoRow"/>
/// instances. The result is not used further.
/// </summary>
/// <returns>A task that completes once the file has been deserialized.</returns>
public async Task OpenVideos()
{
    // NOTE(review): hard-coded, machine-specific path - this only works on the author's machine.
    using (var s = File.OpenRead("C:\\Users\\mark\\Downloads\\Videos.0.parquet"))
    {
        // Deserialize is synchronous. Awaiting it through Task.Run gives this async method a
        // real await instead of running to completion on the caller's thread. The stream stays
        // open until the await finishes because the using block encloses it.
        var rows = await Task.Run(() => ParquetConvert.Deserialize<VideoRow>(s));
    }
}
/// <summary>
/// HTTP-triggered function that downloads every blob in the 'files' container to a temporary
/// file, deserializes each one as Parquet into <see cref="UserData"/> rows and returns all
/// rows as an indented JSON array.
/// </summary>
/// <param name="req">Incoming HTTP request (GET or POST); not inspected by this function.</param>
/// <param name="log">Logger used for the invocation message.</param>
/// <returns>
/// An <see cref="OkObjectResult"/> carrying the JSON text when at least one row was read,
/// otherwise a <see cref="BadRequestResult"/>.
/// </returns>
public static async Task<IActionResult> Run(
    [HttpTrigger(AuthorizationLevel.Anonymous, "get", "post", Route = null)] HttpRequest req,
    ILogger log)
{
    log.LogInformation("C# HTTP trigger function processed a request.");

    // Connect to Blob Storage using a connection string.
    // Assumes there is a container called 'files'.
    // NOTE(review): placeholder value - supply the real connection string from configuration,
    // not from source code.
    string connectionString = "<CONNECTION STRING>";
    BlobContainerClient container = new BlobContainerClient(connectionString, "files");

    // Initialise our list of objects
    List<UserData> userDataList = new List<UserData>();

    // Loop through each of the files in the container
    foreach (var file in container.GetBlobs())
    {
        // Initialise a BlobClient so we can work with the named file
        var blockBlob = container.GetBlobClient(file.Name);

        // Unique temporary file to hold the Parquet data. The blob name is deliberately not
        // part of the path: blob names may contain '/' and would then point into directories
        // that do not exist under the temp folder.
        var tempPath = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString());

        try
        {
            // Download the blob to the temporary file without blocking the calling thread
            using (var fileStream = File.OpenWrite(tempPath))
            {
                await blockBlob.DownloadToAsync(fileStream);
            }

            // Re-open the file for reading; the reader is only needed for the row group count
            using (Stream fileStream = File.OpenRead(tempPath))
            using (var parquetReader = new ParquetReader(fileStream))
            {
                // Deserialize the rows one row group at a time and collect them
                for (int i = 0; i < parquetReader.RowGroupCount; i++)
                {
                    userDataList.AddRange(ParquetConvert.Deserialize<UserData>(fileStream, i));
                }
            }
        }
        finally
        {
            // Always remove the temporary file, even when download or parsing fails.
            // File.Delete does not throw if the file was never created.
            File.Delete(tempPath);
        }
    }

    // Check we have rows
    if (userDataList.Count > 0)
    {
        // Return the list as a JSON array
        string json = JsonConvert.SerializeObject(userDataList.ToArray(), Formatting.Indented);
        return new OkObjectResult(json);
    }

    return new BadRequestResult();
}