Ejemplo n.º 1
0
        /// <summary>
        /// Round-trips ten <see cref="SimpleStructure"/> rows through
        /// <c>ParquetConvert.Serialize</c> and <c>ParquetConvert.Deserialize</c>
        /// (Snappy compression, row group size 2) and checks that <c>Id</c>,
        /// <c>NullableId</c>, <c>Name</c> and <c>Date</c> come back unchanged.
        /// </summary>
        public void Serialise_deserialise_all_types()
        {
            DateTime now = DateTime.Now;

            // Materialise once: the LINQ query is deferred, so using it both as the
            // serialisation input and as the expected values would run the projection twice.
            SimpleStructure[] expected = Enumerable
                .Range(0, 10)
                .Select(i => new SimpleStructure
                {
                    Id         = i,
                    // Even rows carry no value so that both null and non-null are exercised.
                    NullableId = (i % 2 == 0) ? new int?() : new int?(i),
                    Name       = $"row {i}",
                    Date       = now.AddDays(i).RoundToSecond().ToUniversalTime()
                })
                .ToArray();

            using (var ms = new MemoryStream())
            {
                ParquetConvert.Serialize(expected, ms, compressionMethod: CompressionMethod.Snappy, rowGroupSize: 2);

                // Rewind so that deserialisation starts at the beginning of the written data.
                ms.Position = 0;

                SimpleStructure[] actual = ParquetConvert.Deserialize<SimpleStructure>(ms);

                // Check the row count first so a short result fails with a clear message
                // rather than an IndexOutOfRangeException inside the loop.
                Assert.Equal(expected.Length, actual.Length);

                for (int i = 0; i < expected.Length; i++)
                {
                    Assert.Equal(expected[i].Id, actual[i].Id);
                    Assert.Equal(expected[i].NullableId, actual[i].NullableId);
                    Assert.Equal(expected[i].Name, actual[i].Name);
                    Assert.Equal(expected[i].Date, actual[i].Date);
                }
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Round-trips ten <see cref="SimpleRenamed"/> rows through
        /// <c>ParquetConvert.Serialize</c> and <c>ParquetConvert.Deserialize</c>
        /// (Snappy compression, row group size 2) and checks that <c>Id</c> and
        /// <c>PersonName</c> come back unchanged.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the column rename itself is presumably declared on
        /// <see cref="SimpleRenamed"/>, which is defined outside this block - confirm there.
        /// </remarks>
        public void Serialise_deserialise_renamed_column()
        {
            // Materialise once: the LINQ query is deferred, so using it both as the
            // serialisation input and as the expected values would run the projection twice.
            SimpleRenamed[] expected = Enumerable
                .Range(0, 10)
                .Select(i => new SimpleRenamed
                {
                    Id         = i,
                    PersonName = $"row {i}"
                })
                .ToArray();

            using (var ms = new MemoryStream())
            {
                ParquetConvert.Serialize(expected, ms, compressionMethod: CompressionMethod.Snappy, rowGroupSize: 2);

                // Rewind so that deserialisation starts at the beginning of the written data.
                ms.Position = 0;

                SimpleRenamed[] actual = ParquetConvert.Deserialize<SimpleRenamed>(ms);

                // Check the row count first so a short result fails with a clear message
                // rather than an IndexOutOfRangeException inside the loop.
                Assert.Equal(expected.Length, actual.Length);

                for (int i = 0; i < expected.Length; i++)
                {
                    Assert.Equal(expected[i].Id, actual[i].Id);
                    Assert.Equal(expected[i].PersonName, actual[i].PersonName);
                }
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Serialises ten rows with no compression and no explicit row group size, then
        /// verifies that the result contains exactly one row group and that the data fields
        /// at schema positions 0 and 1 each report a total count of ten.
        /// </summary>
        public void Serialise_single_row_group()
        {
            var rows = new List<SimpleStructure>();
            for (int n = 0; n < 10; n++)
            {
                rows.Add(new SimpleStructure { Id = n, Name = $"row {n}" });
            }

            using (var buffer = new MemoryStream())
            {
                Schema fileSchema = ParquetConvert.Serialize(rows, buffer, compressionMethod: CompressionMethod.None);

                // Rewind so the reader starts at the beginning of the written data.
                buffer.Seek(0, SeekOrigin.Begin);

                using (var parquet = new ParquetReader3(buffer))
                {
                    Assert.Equal(1, parquet.RowGroupCount);

                    using (ParquetRowGroupReader firstGroup = parquet.OpenRowGroupReader(0))
                    {
                        DataColumn idColumn   = firstGroup.ReadColumn(fileSchema.DataFieldAt(0));
                        DataColumn nameColumn = firstGroup.ReadColumn(fileSchema.DataFieldAt(1));

                        Assert.Equal(10, idColumn.TotalCount);
                        Assert.Equal(10, nameColumn.TotalCount);
                    }
                }
            }
        }
Ejemplo n.º 4
0
 /// <summary>
 /// Deserialises the parquet content of <paramref name="stream"/> into a list of
 /// <typeparamref name="T"/>. The stream is first converted with <c>AsSeekableAsync</c>,
 /// and the synchronous <c>ParquetConvert.Deserialize</c> call is run via <c>Task.Run</c>.
 /// </summary>
 /// <typeparam name="T">Row type; must have a parameterless constructor.</typeparam>
 /// <param name="stream">Source stream holding the parquet data.</param>
 /// <param name="cancellationToken">Passed to <c>AsSeekableAsync</c> and to <c>Task.Run</c>.</param>
 /// <param name="bufferSize">Optional value forwarded to <c>AsSeekableAsync</c>.</param>
 /// <returns>The deserialised rows.</returns>
 public static async Task <IList <T> > ParquetDeserializeAsync <T>(this Stream stream,
                                                                   CancellationToken cancellationToken,
                                                                   int?bufferSize = null)
     where T : new()
 {
     using (var seekable = await stream.AsSeekableAsync(cancellationToken, bufferSize).ConfigureAwait(false))
     {
         // The lambda captures the disposable stream; that is safe because the task is
         // awaited before the using block ends.
         // ReSharper disable once AccessToDisposedClosure -- use is awaited
         IList<T> rows = await Task.Run(() => ParquetConvert.Deserialize<T>(seekable), cancellationToken).ConfigureAwait(false);
         return rows;
     }
 }
Ejemplo n.º 5
0
        /// <summary>
        /// Writes <paramref name="rows"/> to a local parquet file named <c>{name}.parquet</c>
        /// under <c>LocalResultsDir</c> combined with <paramref name="dir"/>, saves that file
        /// to <c>Store</c> under the relative path built from <paramref name="dir"/>, and logs
        /// the store path.
        /// </summary>
        /// <typeparam name="T">Row type; must have a parameterless constructor.</typeparam>
        /// <param name="rows">Rows to serialise.</param>
        /// <param name="name">File name without the <c>.parquet</c> extension.</param>
        /// <param name="dir">Directory used for both the local file and the store path.</param>
        async Task SaveParquet <T>(IEnumerable <T> rows, string name, string dir) where T : new()
        {
            var remoteDir   = StringPath.Relative(dir);
            var parquetFile = LocalResultsDir.Combine(dir).Combine($"{name}.parquet");

            // Serialise to the local file first; the store upload below reads from it.
            ParquetConvert.Serialize(rows, parquetFile.FullPath);

            var remotePath = remoteDir.Add(parquetFile.FileName);
            await Store.Save(remotePath, parquetFile);

            Log.Information("Saved {Path}", remotePath);
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Serialises <paramref name="instances"/> to an in-memory parquet stream with
        /// <c>ParquetConvert.Serialize</c> and immediately deserialises that stream again.
        /// </summary>
        /// <typeparam name="T">Row type; must have a parameterless constructor.</typeparam>
        /// <param name="instances">Objects to round-trip.</param>
        /// <returns>The objects produced by <c>ParquetConvert.Deserialize</c>.</returns>
        protected T[] ConvertSerialiseDeserialise <T>(IEnumerable <T> instances) where T : new()
        {
            using (var buffer = new MemoryStream())
            {
                ParquetConvert.Serialize<T>(instances, buffer);

                // Rewind so that deserialisation starts at the beginning of the written data.
                buffer.Seek(0, SeekOrigin.Begin);

                T[] roundTripped = ParquetConvert.Deserialize<T>(buffer);
                return roundTripped;
            }
        }
Ejemplo n.º 7
0
 /// <summary>
 /// Splits <paramref name="rows"/> into chunks of 200000, writes each chunk to its own
 /// local parquet file named <c>{name}.{index}.parquet</c>, saves each file to
 /// <c>Store</c> and logs the store path. The chunks are processed through
 /// <c>BlockTransform</c>.
 /// </summary>
 /// <typeparam name="T">Row type; must have a parameterless constructor.</typeparam>
 /// <param name="rows">Rows to serialise.</param>
 /// <param name="name">File name prefix, placed before the chunk index.</param>
 /// <param name="dir">Directory used for both the local files and the store paths.</param>
 async Task SaveParquet <T>(IEnumerable <T> rows, string name, string dir) where T : new()
 {
     const int rowsPerFile = 200000;
     // Second BlockTransform argument - presumably the maximum parallelism; TODO confirm.
     const int blockTransformArg = 4;

     // Pair every chunk with its zero-based position so it can be used in the file name.
     var indexedChunks = rows
         .Chunk(rowsPerFile)
         .Select((batch, position) => (chunkRows: batch, index: position));

     await indexedChunks.BlockTransform(async part =>
     {
         var remoteDir   = StringPath.Relative(dir);
         var parquetFile = LocalResultsDir.Combine(dir).Combine($"{name}.{part.index}.parquet");

         ParquetConvert.Serialize(part.chunkRows, parquetFile.FullPath);

         var remotePath = remoteDir.Add(parquetFile.FileName);
         await Store.Save(remotePath, parquetFile);

         Log.Information("Saved {Path}", remotePath);
         return remoteDir;
     }, blockTransformArg);
 }
Ejemplo n.º 8
0
        /// <summary>
        /// Round-trips ten <see cref="StructureWithIgnoredProperties"/> rows through
        /// <c>ParquetConvert.Serialize</c> and <c>ParquetConvert.Deserialize</c> and checks
        /// that <c>Id</c> and <c>Name</c> survive while the remaining properties come back
        /// as the default value of their type.
        /// </summary>
        /// <remarks>
        /// NOTE(review): which properties are ignored is presumably declared on
        /// <see cref="StructureWithIgnoredProperties"/>, defined outside this block - confirm there.
        /// </remarks>
        public void Serialise_Should_Exclude_IgnoredProperties_while_serialized_to_parquetfile()
        {
            // Captured once so every row gets the same timestamp instead of a fresh
            // DateTime.Now for each element.
            DateTime now = DateTime.Now;

            // Materialise once: the LINQ query is deferred, so using it both as the
            // serialisation input and as the expected values would run the projection twice.
            StructureWithIgnoredProperties[] expected = Enumerable
                .Range(0, 10)
                .Select(i => new StructureWithIgnoredProperties
                {
                    Id   = i,
                    Name = $"row {i}",
                    SSN  = "000-00-0000",
                    NonNullableDecimal  = 100.534M,
                    NullableDecimal     = 99.99M,
                    NonNullableDateTime = now,
                    NullableDateTime    = now,
                    NullableInt         = 111,
                    NonNullableInt      = 222
                })
                .ToArray();

            using (var ms = new MemoryStream())
            {
                ParquetConvert.Serialize(expected, ms, compressionMethod: CompressionMethod.Snappy, rowGroupSize: 2);

                // Rewind so that deserialisation starts at the beginning of the written data.
                ms.Position = 0;

                StructureWithIgnoredProperties[] actual = ParquetConvert.Deserialize<StructureWithIgnoredProperties>(ms);

                // Boxed default for a type: a zeroed instance for value types (null for
                // Nullable<T>, because an empty nullable boxes to null) and null for reference types.
                Func<Type, object> defaultOf = type => type.IsValueType ? Activator.CreateInstance(type) : null;

                // Check the row count first so a short result fails with a clear message
                // rather than an IndexOutOfRangeException inside the loop.
                Assert.Equal(expected.Length, actual.Length);

                for (int i = 0; i < expected.Length; i++)
                {
                    Assert.Equal(expected[i].Id, actual[i].Id);
                    Assert.Equal(expected[i].Name, actual[i].Name);

                    // Serialisation ignored the properties below, so deserialising them should
                    // always yield null (or the type's default value). The expected value goes
                    // first so that failure messages label the two values correctly.
                    Assert.Equal(defaultOf(typeof(string)), actual[i].SSN);
                    Assert.Equal(defaultOf(typeof(int)), actual[i].NonNullableInt);
                    Assert.Equal(defaultOf(typeof(int?)), actual[i].NullableInt);
                    Assert.Equal(defaultOf(typeof(decimal)), actual[i].NonNullableDecimal);
                    Assert.Equal(defaultOf(typeof(decimal?)), actual[i].NullableDecimal);
                    Assert.Equal(defaultOf(typeof(DateTime)), actual[i].NonNullableDateTime);
                    Assert.Equal(defaultOf(typeof(DateTime?)), actual[i].NullableDateTime);
                }
            }
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Serialises two <see cref="Installation"/> instances to an in-memory parquet stream,
        /// deserialises them again and asserts that the result is equivalent to the input,
        /// in the same order, ignoring the <c>Id</c> member.
        /// </summary>
        public void SimpleParquetSerializationWorks()
        {
            // Truncated to whole seconds before the round trip.
            var timestamp = DateTimeOffset.UtcNow.TruncateTo(TimeSpan.FromSeconds(1));

            var expected = new Installation[2];
            for (int n = 0; n < expected.Length; n++)
            {
                expected[n] = new Installation(
                    Guid.NewGuid(),
                    $"I{n}",
                    timestamp,
                    timestamp.Add(TimeSpan.FromHours(1)));
            }

            using (var buffer = new MemoryStream())
            {
                ParquetConvert.Serialize(expected, buffer);
                buffer.Position = 0;

                var actual = ParquetConvert.Deserialize<Installation>(buffer);

                actual.Should().BeEquivalentTo(
                    expected,
                    options => options
                        .WithStrictOrdering()
                        // Currently Guid is not serialized :(
                        .Excluding(row => row.Id));
            }
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Wraps <paramref name="value"/> in a single <c>StructureWithTestType&lt;T&gt;</c>,
        /// serialises it with a schema obtained from <c>SchemaReflector.Reflect</c>,
        /// deserialises it again and asserts that exactly one row comes back with the same
        /// <c>Id</c> and <c>TestValue</c>.
        /// </summary>
        /// <typeparam name="T">Type of the value under test.</typeparam>
        /// <param name="value">Value expected to survive the round trip.</param>
        void TestRoundTripSerialization <T>(T value)
        {
            var original = new StructureWithTestType<T>
            {
                Id        = "1",
                TestValue = value,
            };

            Schema reflectedSchema = SchemaReflector.Reflect<StructureWithTestType<T>>();

            using (var buffer = new MemoryStream())
            {
                var rows = new StructureWithTestType<T>[] { original };
                ParquetConvert.Serialize<StructureWithTestType<T>>(rows, buffer, reflectedSchema);

                // Rewind so that deserialisation starts at the beginning of the written data.
                buffer.Seek(0, SeekOrigin.Begin);

                StructureWithTestType<T>[] roundTripped = ParquetConvert.Deserialize<StructureWithTestType<T>>(buffer);

                Assert.Single(roundTripped);
                Assert.Equal("1", roundTripped[0].Id);
                Assert.Equal(value, roundTripped[0].TestValue);
            }
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Serialises ten <see cref="SimpleStructure"/> rows with a row group size of 2, then
        /// deserialises each row group individually and checks that it holds the matching two
        /// rows. Also checks that a row group index past the end throws
        /// <see cref="ArgumentOutOfRangeException"/> for the parameter named "index".
        /// </summary>
        public void Serialise_read_and_deserialise_by_rowgroup()
        {
            // The group count follows from the two inputs instead of being a third magic number.
            const int rowCount      = 10;
            const int rowGroupSize  = 2;
            const int rowGroupCount = rowCount / rowGroupSize;

            DateTime now = DateTime.Now;

            // Materialise once: the LINQ query is deferred, so using it both as the
            // serialisation input and as the expected values would run the projection twice.
            SimpleStructure[] expected = Enumerable
                .Range(0, rowCount)
                .Select(i => new SimpleStructure
                {
                    Id         = i,
                    // Even rows carry no value so that both null and non-null are exercised.
                    NullableId = (i % 2 == 0) ? new int?() : new int?(i),
                    Name       = $"row {i}",
                    Date       = now.AddDays(i).RoundToSecond().ToUniversalTime()
                })
                .ToArray();

            using (var ms = new MemoryStream())
            {
                ParquetConvert.Serialize(expected, ms, compressionMethod: CompressionMethod.Snappy, rowGroupSize: rowGroupSize);

                ms.Position = 0;

                for (int r = 0; r < rowGroupCount; r++)
                {
                    SimpleStructure[] rowGroupRecords = ParquetConvert.Deserialize<SimpleStructure>(ms, rowGroupIndex: r);
                    Assert.Equal(rowGroupSize, rowGroupRecords.Length);

                    // Row k of group r corresponds to input row (rowGroupSize * r + k).
                    for (int k = 0; k < rowGroupSize; k++)
                    {
                        SimpleStructure want = expected[rowGroupSize * r + k];
                        SimpleStructure got  = rowGroupRecords[k];

                        Assert.Equal(want.Id, got.Id);
                        Assert.Equal(want.NullableId, got.NullableId);
                        Assert.Equal(want.Name, got.Name);
                        Assert.Equal(want.Date, got.Date);
                    }
                }

                // The first index past the last group, and one far out of range, must both be rejected.
                Assert.Throws<ArgumentOutOfRangeException>("index", () => ParquetConvert.Deserialize<SimpleStructure>(ms, rowGroupCount));
                Assert.Throws<ArgumentOutOfRangeException>("index", () => ParquetConvert.Deserialize<SimpleStructure>(ms, 99999));
            }
        }
        /// <summary>
        /// Provides an IEnumerable interface over the Parquet.Net deserialization of the data,
        /// for efficient processing with support for LINQ if/when needed.
        /// WARNING: Care must be taken to avoid unnecessary multiple enumerations, e.g. by
        ///             projecting filtered results into a List (ToList())!
        /// </summary>
        /// <typeparam name="T">Row type; must have a parameterless constructor.</typeparam>
        /// <returns>
        /// A lazily evaluated sequence that yields the items of each row group in turn.
        /// Because this is an iterator, nothing in the body runs until enumeration starts.
        /// </returns>
        public IEnumerable <T> Read <T>() where T : new()
        {
            AssertParquetReaderIsOpen();

            // It seems that the only API that works consistently is this one, whereby we loop
            // over the row groups ourselves. This also provides a little more control over the
            // IEnumerable processing by not forcing all rows and row groups to be in memory at
            // one time.
            int groupIndex = 0;
            while (groupIndex < _parquetReader.RowGroupCount)
            {
                LogDebug($"Enumerating over RowGroup #[{groupIndex}]...");

                // Time only the deserialization of this row group, not the consumer's
                // processing of the yielded items.
                var stopwatch = Stopwatch.StartNew();
                var rowGroupItems = ParquetConvert.Deserialize<T>(_blobStream, groupIndex);
                stopwatch.Stop();

                LogDebug($"Deserialized RowGroup [{groupIndex}] from the Stream in [{stopwatch.ToElapsedTimeDescriptiveFormat()}].");

                foreach (var row in rowGroupItems)
                {
                    yield return row;
                }

                groupIndex++;
            }
        }
Ejemplo n.º 13
0
 /// <summary>
 /// Opens a hard-coded local parquet file and deserialises it into <c>VideoRow</c>
 /// instances. The result is not used; the method only exercises the deserialisation.
 /// </summary>
 /// <remarks>Declared <c>async</c> although the body contains no <c>await</c>.</remarks>
 public async Task OpenVideos()
 {
     using (var parquetFile = File.OpenRead(@"C:\Users\mark\Downloads\Videos.0.parquet"))
     {
         var videoRows = ParquetConvert.Deserialize<VideoRow>(parquetFile);
     }
 }
        /// <summary>
        /// HTTP-triggered function that downloads every blob in the 'files' container to a
        /// temporary file, reads it as a Parquet file, deserialises all of its row groups into
        /// <c>UserData</c> and returns the combined rows as an indented JSON array.
        /// </summary>
        /// <param name="req">Incoming HTTP request; not read by this function.</param>
        /// <param name="log">Logger used for the request message.</param>
        /// <returns>
        /// An <c>OkObjectResult</c> carrying the JSON text when at least one row was read;
        /// otherwise a <c>BadRequestResult</c>.
        /// </returns>
        public static async Task <IActionResult> Run(
            [HttpTrigger(AuthorizationLevel.Anonymous, "get", "post", Route = null)] HttpRequest req,
            ILogger log)
        {
            log.LogInformation("C# HTTP trigger function processed a request.");

            // Connect to Blob Storage using a connection string.
            // Assumes there is a container called 'files'.
            string connectionString       = "<CONNECTION STRING>";
            BlobContainerClient container = new BlobContainerClient(connectionString, "files");

            // Rows collected from every Parquet file in the container.
            List<UserData> userDataList = new List<UserData>();

            foreach (var file in container.GetBlobs())
            {
                var blockBlob = container.GetBlobClient(file.Name);

                // Unique temporary file for this blob. The blob name is deliberately not
                // appended: it may contain path separators, which would point at a
                // directory that does not exist under the temp folder.
                var tempPath = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString());

                try
                {
                    // Download the blob without blocking the calling thread.
                    using (var fileStream = File.Create(tempPath))
                    {
                        await blockBlob.DownloadToAsync(fileStream);
                    }

                    using (Stream fileStream = File.OpenRead(tempPath))
                    using (var parquetReader = new ParquetReader(fileStream))
                    {
                        // The reader is only needed for the row group count;
                        // ParquetConvert reads each group from the stream itself.
                        for (int i = 0; i < parquetReader.RowGroupCount; i++)
                        {
                            userDataList.AddRange(ParquetConvert.Deserialize<UserData>(fileStream, i));
                        }
                    }
                }
                finally
                {
                    // Always remove the temporary copy, even when the download or parsing fails.
                    File.Delete(tempPath);
                }
            }

            // No rows at all is reported as a bad request.
            if (userDataList.Count == 0)
            {
                return new BadRequestResult();
            }

            // Return the rows as a JSON array.
            string json = JsonConvert.SerializeObject(userDataList, Formatting.Indented);

            return new OkObjectResult(json);
        }