Пример #1
0
        /// <summary>
        /// Benchmark: reads the three columns (timestamp, object id, value) of the
        /// first row group of <c>Filename</c> using ParquetSharp logical readers.
        /// </summary>
        /// <returns>The three column arrays read from the file.</returns>
        public (DateTime[] dateTimes, int[] objectIds, float[] values) ParquetSharp()
        {
            using var fileReader  = new ParquetFileReader(Filename);
            using var groupReader = fileReader.RowGroup(0);

            // Column 0: timestamps. Each logical reader is disposed as soon as
            // its column has been fully read.
            DateTime[] dateTimes;
            using (var dateTimeReader = groupReader.Column(0).LogicalReader <DateTime>())
            {
                dateTimes = dateTimeReader.ReadAll(_numRows);
            }

            // Column 1: object ids.
            int[] objectIds;
            using (var objectIdReader = groupReader.Column(1).LogicalReader <int>())
            {
                objectIds = objectIdReader.ReadAll(_numRows);
            }

            // Column 2: float values.
            float[] values;
            using (var valueReader = groupReader.Column(2).LogicalReader <float>())
            {
                values = valueReader.ReadAll(_numRows);
            }

            // Close the file eagerly; the using declaration above would dispose it
            // at method exit anyway, but this releases the handle sooner.
            fileReader.Close();

            // Optionally validate the data against the in-memory reference arrays.
            if (Check.Enabled)
            {
                Check.ArraysAreEqual(_allDates, dateTimes);
                Check.ArraysAreEqual(_allObjectIds, objectIds);
                Check.ArraysAreEqual(_allValues, values);
            }

            return(dateTimes, objectIds, values);
        }
Пример #2
0
        /// <summary>
        /// Writes a single int32 column with a known number of rows, reads it back,
        /// and verifies that <c>HasNext</c> reports false once all values are consumed.
        /// </summary>
        public static void TestHasNext()
        {
            const int numRows       = 5;
            var       schemaColumns = new Column[] { new Column <int>("int32_field") };
            var       values        = Enumerable.Range(0, numRows).ToArray();

            using var buffer = new ResizableBuffer();

            using (var outStream = new BufferOutputStream(buffer))
            {
                using var writer         = new ParquetFileWriter(outStream, schemaColumns);
                using var rowGroupWriter = writer.AppendRowGroup();
                using var colWriter      = (ColumnWriter <int>)rowGroupWriter.NextColumn();

                colWriter.WriteBatch(values);

                writer.Close();
            }

            // Read back the columns and make sure they match.
            using var inStream       = new BufferReader(buffer);
            using var fileReader     = new ParquetFileReader(inStream);
            using var rowGroupReader = fileReader.RowGroup(0);
            using var column         = (ColumnReader <int>)rowGroupReader.Column(0);

            // Request more values than were written; only numRows should come back.
            var read = new int[1024];

            column.ReadBatch(1024, read, out var numValues);

            // NUnit's Assert.AreEqual takes (expected, actual): numRows is expected.
            Assert.AreEqual(numRows, numValues);
            Assert.AreEqual(values, read.AsSpan(0, numRows).ToArray());
            Assert.IsFalse(column.HasNext);
        }
        /// <summary>
        /// Round-trips a million integers through an in-memory stream and verifies
        /// the values survive the write/read cycle unchanged.
        /// </summary>
        public static void TestInMemoryRoundTrip()
        {
            var ids = Enumerable.Range(0, 1024 * 1024).ToArray();

            using var memory = new MemoryStream();

            // Write phase. The output wrapper must leave the underlying stream
            // open, since we still need to read it back afterwards.
            using (var output = new ManagedOutputStream(memory, leaveOpen: true))
            {
                using var fileWriter = new ParquetFileWriter(output, new Column[] { new Column <int>("ids") });
                using var rowGroup   = fileWriter.AppendRowGroup();
                using var idsWriter  = rowGroup.NextColumn().LogicalWriter <int>();

                idsWriter.WriteBatch(ids);

                fileWriter.Close();
            }

            // Rewind so the reader starts from the beginning of the file.
            memory.Seek(0, SeekOrigin.Begin);

            // Read phase: wrap the stream for random access and verify the data.
            using var source    = new ManagedRandomAccessFile(memory, leaveOpen: true);
            using var reader    = new ParquetFileReader(source);
            using var rowGroup0 = reader.RowGroup(0);
            using var idsReader = rowGroup0.Column(0).LogicalReader <int>();

            Assert.AreEqual(ids, idsReader.ReadAll(ids.Length));
        }
Пример #4
0
        /// <summary>
        /// Verifies that a file handle is released even when reading fails with a
        /// bad logical-reader cast; deleting the file in the finally block would
        /// throw on Windows if the handle were still held.
        /// </summary>
        public static void TestFileHandleHasBeenReleased()
        {
            var exception = Assert.Throws <InvalidCastException>(() =>
            {
                try
                {
                    // Write a small int column to a real file on disk.
                    using (var writer = new ParquetFileWriter("file.parquet", new Column[] { new Column <int>("ids") }))
                    {
                        using var groupWriter  = writer.AppendRowGroup();
                        using var columnWriter = groupWriter.NextColumn().LogicalWriter <int>();

                        columnWriter.WriteBatch(new[] { 1, 2, 3 });

                        writer.Close();
                    }

                    // Open with the wrong logical reader type on purpose.
                    using var reader       = new ParquetFileReader("file.parquet");
                    using var groupReader  = reader.RowGroup(0);
                    using var columnReader = groupReader.Column(0).LogicalReader <float>();

                    // Unreachable: the cast above throws InvalidCastException.
                    Assert.AreEqual(new[] { 1, 2, 3 }, columnReader.ReadAll(3));
                }
                finally
                {
                    // This will throw on Windows if the file handle has not been released.
                    File.Delete("file.parquet");
                }
            });

            StringAssert.StartsWith("Unable to cast object of type", exception?.Message);
        }
Пример #5
0
        /// <summary>
        /// Round-trips four rows that each contain an empty string array, checking
        /// that array columns with zero-length values survive write and read.
        /// </summary>
        public static void TestArrayOfEmptyStringArraysRoundtrip()
        {
            // Four rows, each holding an empty string array.
            var expected = Enumerable.Range(0, 4).Select(_ => new string[] { }).ToArray();

            using var buffer = new ResizableBuffer();

            using (var outStream = new BufferOutputStream(buffer))
            {
                using var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column <string[]>("a") });

                using (var rowGroupWriter = fileWriter.AppendRowGroup())
                {
                    using var arrayWriter = rowGroupWriter.NextColumn().LogicalWriter <string[]>();
                    arrayWriter.WriteBatch(expected);
                }

                fileWriter.Close();
            }

            using var inStream    = new BufferReader(buffer);
            using var fileReader  = new ParquetFileReader(inStream);
            using var rowGroup    = fileReader.RowGroup(0);
            using var arrayReader = rowGroup.Column(0).LogicalReader <string[]>();

            // The metadata row count and the values themselves must both match.
            Assert.AreEqual(4, rowGroup.MetaData.NumRows);

            var actual = arrayReader.ReadAll(4);

            Assert.AreEqual(expected, actual);
        }
        /// <summary>
        /// Round-trips a small int column through a real file on disk using managed
        /// stream wrappers, cleaning up the file afterwards.
        /// </summary>
        public static void TestFileStreamRoundTrip()
        {
            try
            {
                // Write phase: wrap a FileStream in a managed output stream.
                using (var output = new ManagedOutputStream(File.OpenWrite("file.parquet")))
                {
                    using var fileWriter = new ParquetFileWriter(output, new Column[] { new Column <int>("ids") });
                    using var rowGroup   = fileWriter.AppendRowGroup();
                    using var idsWriter  = rowGroup.NextColumn().LogicalWriter <int>();

                    idsWriter.WriteBatch(new[] { 1, 2, 3 });

                    fileWriter.Close();
                }

                // Read phase: reopen the file through a managed random-access wrapper.
                using var source    = new ManagedRandomAccessFile(File.OpenRead("file.parquet"));
                using var reader    = new ParquetFileReader(source);
                using var rowGroup0 = reader.RowGroup(0);
                using var idsReader = rowGroup0.Column(0).LogicalReader <int>();

                Assert.AreEqual(new[] { 1, 2, 3 }, idsReader.ReadAll(3));
            }
            finally
            {
                // Always remove the temporary file, even if an assertion failed.
                File.Delete("file.parquet");
            }
        }
        /// <summary>
        /// Verifies that the file handle is released even when reading fails with a
        /// bad logical-reader cast; deleting the file in the finally block would
        /// throw on Windows if the handle were still held.
        /// </summary>
        public static void TestFileHandleHasBeenReleased()
        {
            var exception = Assert.Throws <InvalidCastException>(() =>
            {
                try
                {
                    // Write a small int column to a real file on disk.
                    using (var writer = new ParquetFileWriter("file.parquet", new Column[] { new Column <int>("ids") }))
                        using (var group = writer.AppendRowGroup())
                            using (var column = group.NextColumn().LogicalWriter <int>())
                            {
                                column.WriteBatch(new[] { 1, 2, 3 });
                            }

                    // Open with the wrong logical reader type on purpose.
                    using (var reader = new ParquetFileReader("file.parquet"))
                        using (var group = reader.RowGroup(0))
                            using (var column = group.Column(0).LogicalReader <float>())
                            {
                                Assert.AreEqual(new[] { 1, 2, 3 }, column.ReadAll(3));
                            }
                }
                finally
                {
                    // This will throw on Windows if the file handle has not been released.
                    File.Delete("file.parquet");
                }
            });

            // Assert.Throws can return null; use ?. so a missing exception shows up
            // as an assertion failure instead of a NullReferenceException.
            Assert.AreEqual(
                "Unable to cast object of type " +
                "'ParquetSharp.LogicalColumnReader`3[System.Int32,System.Int32,System.Int32]'" +
                " to type 'ParquetSharp.LogicalColumnReader`1[System.Single]'.",
                exception?.Message);
        }
Пример #8
0
        /// <summary>
        /// Writes a column to an in-memory buffer, copies it into a plain byte
        /// array, and reads it back through a pinned pointer to that array.
        /// </summary>
        public static unsafe void TestParquetReadFromBuffer()
        {
            var expected = Enumerable.Range(0, 100).ToArray();

            // Write out a single column
            byte[] parquetFileBytes;
            using (var outBuffer = new ResizableBuffer())
            {
                using (var outStream = new BufferOutputStream(outBuffer))
                    using (var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column <int>("int_field") }))
                        using (var rowGroupWriter = fileWriter.AppendRowGroup())
                            using (var colWriter = rowGroupWriter.NextColumn().LogicalWriter <int>())
                            {
                                colWriter.WriteBatch(expected);
                            }

                // Snapshot the written file into a managed byte array before the
                // native buffer is disposed.
                parquetFileBytes = outBuffer.ToArray();
            }

            // Read it back
            // The byte array must stay pinned for as long as the native-side
            // IO.Buffer wraps its address.
            fixed(byte *fixedBytes = parquetFileBytes)
            using (var buffer = new IO.Buffer(new IntPtr(fixedBytes), parquetFileBytes.Length))
                using (var inStream = new BufferReader(buffer))
                    using (var fileReader = new ParquetFileReader(inStream))
                        using (var rowGroup = fileReader.RowGroup(0))
                            using (var columnReader = rowGroup.Column(0).LogicalReader <int>())
                            {
                                var allData = columnReader.ReadAll((int)rowGroup.MetaData.NumRows);
                                Assert.AreEqual(expected, allData);
                            }
        }
Пример #9
0
        /// <summary>
        /// Reads every column of the first row group back from <paramref name="buffer"/>
        /// and checks descriptors, statistics and values against <paramref name="expectedColumns"/>.
        /// </summary>
        /// <param name="rowsPerBatch">Batch size used when reading values.</param>
        /// <param name="readBufferLength">Internal buffer length for the logical readers.</param>
        /// <param name="buffer">Buffer holding a previously written parquet file.</param>
        /// <param name="expectedColumns">Expected per-column metadata and values.</param>
        private static void AssertReadRoundtrip(int rowsPerBatch, int readBufferLength, ResizableBuffer buffer, ExpectedColumn[] expectedColumns)
        {
            using var inStream       = new BufferReader(buffer);
            using var fileReader     = new ParquetFileReader(inStream);
            using var fileMetaData   = fileReader.FileMetaData;
            using var rowGroupReader = fileReader.RowGroup(0);

            var rowGroupMetaData = rowGroupReader.MetaData;
            var numRows          = rowGroupMetaData.NumRows;

            for (int c = 0; c != fileMetaData.NumColumns; ++c)
            {
                var expected = expectedColumns[c];

                // Test properties, and read methods.
                using (var columnReader = rowGroupReader.Column(c).LogicalReader(readBufferLength))
                {
                    var descr         = columnReader.ColumnDescriptor;
                    var chunkMetaData = rowGroupMetaData.GetColumnChunkMetaData(c);
                    var statistics    = chunkMetaData.Statistics;

                    Console.WriteLine("Reading '{0}'", expected.Name);

                    Assert.AreEqual(expected.PhysicalType, descr.PhysicalType);
                    Assert.AreEqual(expected.LogicalType, descr.LogicalType);
                    Assert.AreEqual(expected.Values, columnReader.Apply(new LogicalValueGetter(checked ((int)numRows), rowsPerBatch)));
                    Assert.AreEqual(expected.Length, descr.TypeLength);
                    // Precision/scale only apply to decimal columns; -1 otherwise.
                    Assert.AreEqual((expected.LogicalType as DecimalLogicalType)?.Precision ?? -1, descr.TypePrecision);
                    Assert.AreEqual((expected.LogicalType as DecimalLogicalType)?.Scale ?? -1, descr.TypeScale);
                    Assert.AreEqual(expected.HasStatistics, chunkMetaData.IsStatsSet);

                    if (expected.HasStatistics)
                    {
                        Assert.AreEqual(expected.HasMinMax, statistics.HasMinMax);
                        //Assert.AreEqual(expected.NullCount, statistics.NullCount);
                        //Assert.AreEqual(expected.NumValues, statistics.NumValues);
                        Assert.AreEqual(expected.PhysicalType, statistics.PhysicalType);

                        // BUG Don't check for decimal until https://issues.apache.org/jira/browse/ARROW-6149 is fixed.
                        var buggy = expected.LogicalType is DecimalLogicalType;

                        if (expected.HasMinMax && !buggy)
                        {
                            Assert.AreEqual(expected.Min, expected.Converter(statistics.MinUntyped));
                            Assert.AreEqual(expected.Max, expected.Converter(statistics.MaxUntyped));
                        }
                    }
                    else
                    {
                        Assert.IsNull(statistics);
                    }
                }

                // Test IEnumerable interface
                using (var columnReader = rowGroupReader.Column(c).LogicalReader(readBufferLength))
                {
                    Assert.AreEqual(expected.Values, columnReader.Apply(new LogicalColumnReaderToArray()));
                }
            }
        }
Пример #10
0
        /// <summary>
        /// Writes many copies of a very long string to stress the string-to-bytes
        /// memory pooling in the ByteBuffer class.
        /// </summary>
        public static void TestWriteLongString()
        {
            const int numStrings = 100;

            // Generate lots of digits of 0.123456789101112131415...
            // The digit string does not depend on the row index, so build it once
            // instead of re-concatenating ~12k characters per row.
            var longString = "0." + string.Join("", Enumerable.Range(1, 3500).Select(j => j.ToString())) + "...";
            var strings    = Enumerable.Repeat(longString, numStrings).ToArray();

            using var buffer = new ResizableBuffer();

            using (var outStream = new BufferOutputStream(buffer))
            {
                using var fileWriter   = new ParquetFileWriter(outStream, new Column[] { new Column <string>("Name") });
                using var groupWriter  = fileWriter.AppendRowGroup();
                using var columnWriter = groupWriter.NextColumn().LogicalWriter <string>();

                // Strings to byte arrays memory pooling is done by the ByteBuffer class.
                // If something is fishy there (e.g. bad memory ownership wrt the GC),
                // we expect to see consequences here if we write enough strings.
                // It's not bullet proof, but it has found a few issues.
                columnWriter.WriteBatch(strings);

                fileWriter.Close();
            }

            using var inStream     = new BufferReader(buffer);
            using var fileReader   = new ParquetFileReader(inStream);
            using var groupReader  = fileReader.RowGroup(0);
            using var columnReader = groupReader.Column(0).LogicalReader <string>();

            Assert.AreEqual(strings, columnReader.ReadAll(numStrings));
        }
Пример #11
0
        /// <summary>
        /// Demonstrates re-encrypting a parquet file: copies the source encryption
        /// settings, overrides a few columns, and transforms the file.
        /// </summary>
        public static void Demo()
        {
            // Open input and output file streams. Using declarations ensure both
            // handles are released when the method exits (they were leaked before).
            using Stream inputFile  = File.OpenRead(".\\ResourceFiles\\userdata1.parquet");
            using Stream outputFile = File.OpenWrite(".\\ResourceFiles\\out1.parquet");

            // Create reader
            using ParquetFileReader reader = new ParquetFileReader(inputFile);

            // Copy source settings as target settings
            List <FileEncryptionSettings> writerSettings = reader.FileEncryptionSettings
                                                           .Select(s => Copy(s))
                                                           .ToList();

            // Modify a few column settings
            writerSettings[0]  = new FileEncryptionSettings <DateTimeOffset?>(encryptionKey, SqlSerializerFactory.Default.GetDefaultSerializer <DateTimeOffset?>());
            writerSettings[3]  = new FileEncryptionSettings <string>(encryptionKey, EncryptionType.Deterministic, new SqlVarcharSerializer(size: 255));
            writerSettings[10] = new FileEncryptionSettings <double?>(encryptionKey, StandardSerializerFactory.Default.GetDefaultSerializer <double?>());

            // Create and pass the target settings to the writer
            using ParquetFileWriter writer = new ParquetFileWriter(outputFile, writerSettings);

            // Process the file
            ColumnarCryptographer cryptographer = new ColumnarCryptographer(reader, writer);

            cryptographer.Transform();

            Console.Clear();
        }
Пример #12
0
        /// <summary>
        /// Writes an optional int32 column with an explicit null via definition
        /// levels and verifies the null round-trips through a logical reader.
        /// </summary>
        public static void TestWriteBatchWithNullOptionalField()
        {
            using (var buffer = new ResizableBuffer())
            {
                using (var outStream = new BufferOutputStream(buffer))
                    using (var writer = new ParquetFileWriter(outStream, new Column[] { new Column <int?>("int32?") }))
                        using (var rowGroupWriter = writer.AppendRowGroup())
                            using (var colWriter = (ColumnWriter <int>)rowGroupWriter.NextColumn())
                            {
                                // Definition level 1 = value present, 0 = null.
                                // Only the two defined slots get values (1 and 2).
                                var defLevels = new short[] { 1, 0, 1 };
                                var values    = new[] { 1, 2 };

                                colWriter.WriteBatch(defLevels.Length, defLevels, null, values);
                            }

                using (var inStream = new BufferReader(buffer))
                    using (var reader = new ParquetFileReader(inStream))
                        using (var rowGroupReader = reader.RowGroup(0))
                            using (var colReader = rowGroupReader.Column(0).LogicalReader <int?>())
                            {
                                // Expect the null re-inserted in the middle slot.
                                var results = new int?[3];
                                colReader.ReadBatch(results, 0, 3);

                                Assert.AreEqual(new int?[] { 1, null, 2 }, results);
                            }
            }
        }
Пример #13
0
        /// <summary>
        /// Writes a column to a BufferOutputStream and verifies that Finish()
        /// yields a buffer from which the data can be read back intact.
        /// </summary>
        public static void TestBufferOutputStreamFinish()
        {
            var expected = Enumerable.Range(0, 100).ToArray();

            using var outStream = new BufferOutputStream();

            // Write out a single column
            using (var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column <int>("int_field") }))
            {
                using (var rowGroupWriter = fileWriter.AppendRowGroup())
                {
                    using var intWriter = rowGroupWriter.NextColumn().LogicalWriter <int>();
                    intWriter.WriteBatch(expected);
                }

                fileWriter.Close();
            }

            // Read it back: Finish() seals the stream and hands over its buffer.
            using var finished  = outStream.Finish();
            using var inStream  = new BufferReader(finished);
            using var reader    = new ParquetFileReader(inStream);
            using var rowGroup  = reader.RowGroup(0);
            using var intReader = rowGroup.Column(0).LogicalReader <int>();

            var rowCount = (int)rowGroup.MetaData.NumRows;
            var actual   = intReader.ReadAll(rowCount);

            Assert.AreEqual(expected, actual);
        }
Пример #14
0
        /// <summary>
        /// Decrypts an encrypted parquet file: registers the key store provider,
        /// clones the source encryption settings, switches three columns to
        /// plaintext, and writes the transformed file.
        /// </summary>
        public void DecryptParquetFileCorrectly()
        {
            using Stream inputFile         = File.OpenRead("ResourceFiles\\ciphertext.parquet");
            using Stream outputFile        = File.OpenWrite($"ResourceFiles\\{nameof(DecryptParquetFileCorrectly)}_out.parquet");
            using ParquetFileReader reader = new ParquetFileReader(inputFile);

            // Make the Azure key provider available so column keys can be unwrapped.
            reader.RegisterKeyStoreProviders(
                new Dictionary <string, EncryptionKeyStoreProvider> {
                [azureKeyProvider.ProviderName] = azureKeyProvider
            }
                );

            // Start from a clone of the source settings so untouched columns keep
            // their original encryption configuration.
            var writerSettings = reader.FileEncryptionSettings
                                 .Select(s => (FileEncryptionSettings)s.Clone())
                                 .ToList();

            var targetColumnTypes = reader.FileEncryptionSettings
                                    .Select(s => s.GetSerializer().GetGenericType())
                                    .ToList();

            // Columns 0, 3 and 10 are rewritten as plaintext in the output file.
            writerSettings[0]  = Create(targetColumnTypes[0], dataEncryptionKey, EncryptionType.Plaintext, GetSerializer(targetColumnTypes[0]));
            writerSettings[3]  = Create(targetColumnTypes[3], dataEncryptionKey, EncryptionType.Plaintext, GetSerializer(targetColumnTypes[3]));
            writerSettings[10] = Create(targetColumnTypes[10], dataEncryptionKey, EncryptionType.Plaintext, GetSerializer(targetColumnTypes[10]));

            using ParquetFileWriter writer = new ParquetFileWriter(outputFile, writerSettings);

            ColumnarCryptographer cryptographer = new ColumnarCryptographer(reader, writer);

            cryptographer.Transform();
        }
Пример #15
0
        /// <summary>
        /// Reads back the first column of the first row group from
        /// <paramref name="buffer"/> and asserts it equals <paramref name="expected"/>.
        /// </summary>
        private static void CheckWrittenValues <TValue>(ResizableBuffer buffer, TValue[] expected)
        {
            // Read back regular float values.
            using var input       = new BufferReader(buffer);
            using var fileReader  = new ParquetFileReader(input);
            using var groupReader = fileReader.RowGroup(0);
            using var reader      = groupReader.Column(0).LogicalReader <TValue>();

            var rowCount = checked ((int)groupReader.MetaData.NumRows);
            var actual   = reader.ReadAll(rowCount);

            Assert.AreEqual(expected, actual);
        }
Пример #16
0
        /// <summary>
        /// Checks that reading with a custom type override fails with
        /// NotSupportedException when no converter factory has been provided.
        /// </summary>
        public static void TestReadNoConverterFactory()
        {
            // Test that we cannot read back the values using a custom type without providing a factory.
            using var buffer      = WriteTestValues(Values);
            using var input       = new BufferReader(buffer);
            using var fileReader  = new ParquetFileReader(input);
            using var groupReader = fileReader.RowGroup(0);

            var exception = Assert.Throws <NotSupportedException>(
                () => groupReader.Column(0).LogicalReaderOverride <VolumeInDollars>());

            StringAssert.StartsWith("unsupported logical system type", exception?.Message);
        }
        /// <summary>
        /// Verifies file-level metadata and per-column descriptors, encodings and
        /// physical values of a previously written parquet file.
        /// </summary>
        /// <param name="buffer">Buffer holding the written parquet file.</param>
        /// <param name="expectedColumns">Expected per-column metadata and values.</param>
        /// <param name="useDictionaryEncoding">Whether the file was written with dictionary encoding enabled.</param>
        private static void AssertReadRoundtrip(ResizableBuffer buffer, ExpectedColumn[] expectedColumns, bool useDictionaryEncoding)
        {
            using var inStream     = new BufferReader(buffer);
            using var fileReader   = new ParquetFileReader(inStream);
            using var fileMetaData = fileReader.FileMetaData;

            var numRows = expectedColumns.First().Values.Length;

            // File-level metadata written by parquet-cpp.
            Assert.AreEqual("parquet-cpp version 1.5.1-SNAPSHOT", fileMetaData.CreatedBy);
            Assert.AreEqual(new Dictionary <string, string> {
                { "case", "Test" }, { "Awesome", "true" }
            }, fileMetaData.KeyValueMetadata);
            Assert.AreEqual(expectedColumns.Length, fileMetaData.NumColumns);
            Assert.AreEqual(numRows, fileMetaData.NumRows);
            Assert.AreEqual(1, fileMetaData.NumRowGroups);
            // One schema element per column plus the root group node.
            Assert.AreEqual(1 + expectedColumns.Length, fileMetaData.NumSchemaElements);
            Assert.AreEqual(ParquetVersion.PARQUET_1_0, fileMetaData.Version);
            Assert.AreEqual("parquet-cpp version 1.5.1", fileMetaData.WriterVersion.ToString());

            using var rowGroupReader = fileReader.RowGroup(0);
            var rowGroupMetaData = rowGroupReader.MetaData;

            for (int c = 0; c != fileMetaData.NumColumns; ++c)
            {
                using var columnReader = rowGroupReader.Column(c);

                var expected = expectedColumns[c];

                Console.WriteLine("Reading '{0}'", expected.Name);

                var descr         = columnReader.ColumnDescriptor;
                var chunkMetaData = rowGroupMetaData.GetColumnChunkMetaData(c);

                Assert.AreEqual(expected.MaxDefinitionlevel, descr.MaxDefinitionLevel);
                Assert.AreEqual(expected.MaxRepetitionLevel, descr.MaxRepetitionLevel);
                Assert.AreEqual(expected.PhysicalType, descr.PhysicalType);
                Assert.AreEqual(expected.LogicalType, descr.LogicalType);
                Assert.AreEqual(expected.ColumnOrder, descr.ColumnOrder);
                Assert.AreEqual(expected.SortOrder, descr.SortOrder);
                Assert.AreEqual(expected.Name, descr.Name);
                Assert.AreEqual(expected.TypeLength, descr.TypeLength);
                Assert.AreEqual(expected.TypePrecision, descr.TypePrecision);
                Assert.AreEqual(expected.TypeScale, descr.TypeScale);

                // PlainDictionary only appears in the chunk encodings when
                // dictionary encoding was enabled at write time.
                Assert.AreEqual(
                    expected.Encodings.Where(e => useDictionaryEncoding || e != Encoding.PlainDictionary).ToArray(),
                    chunkMetaData.Encodings.Distinct().ToArray());

                Assert.AreEqual(expected.Compression, chunkMetaData.Compression);
                Assert.AreEqual(expected.Values, columnReader.Apply(new PhysicalValueGetter(chunkMetaData.NumValues)).values);
            }
        }
Пример #18
0
        /// <summary>
        /// Verifies ColumnReader.Skip: skipping part of a column leaves the
        /// remaining values readable, and skipping past the end is clamped to the
        /// number of rows.
        /// </summary>
        public static void TestSkip()
        {
            const int numRows = 11;

            var schemaColumns = new Column[] { new Column <int>("int32_field") };
            var values        = Enumerable.Range(0, numRows).ToArray();

            using var buffer = new ResizableBuffer();

            using (var outStream = new BufferOutputStream(buffer))
            {
                using var writer = new ParquetFileWriter(outStream, schemaColumns);

                using (var rowGroupWriter = writer.AppendRowGroup())
                {
                    // Dispose the column writer like the other tests do, so its
                    // native resources are released deterministically.
                    using var colWriter = (ColumnWriter <int>)rowGroupWriter.NextColumn();
                    colWriter.WriteBatch(numRows, values);
                }

                writer.Close();
            }

            using var inStream       = new BufferReader(buffer);
            using var fileReader     = new ParquetFileReader(inStream);
            using var rowGroupReader = fileReader.RowGroup(0);

            // Read back the columns after skipping numRows and make sure the values are what we expect.
            using (var column = rowGroupReader.Column(0))
            {
                const int numToSkip = 5;

                var skipped = column.Skip(numToSkip);

                Assert.AreEqual(numToSkip, skipped);

                var read = new int[1024];
                ((ColumnReader <int>)column).ReadBatch(1024, read, out var numValues);

                // NUnit's Assert.AreEqual takes (expected, actual).
                Assert.AreEqual(numRows - numToSkip, numValues);
                Assert.AreEqual(values.AsSpan(numToSkip).ToArray(), read.AsSpan(0, numRows - numToSkip).ToArray());
            }

            // Check skipped is bound to the maximum number of rows.
            using (var column = rowGroupReader.Column(0))
            {
                var skipped = column.Skip(1024);

                Assert.AreEqual(numRows, skipped);
                Assert.IsFalse(column.HasNext);
            }
        }
Пример #19
0
        /// <summary>
        /// Writes one dictionary-encoded int column and one ByteStreamSplit float
        /// column, then checks the chunk encodings and values on read-back.
        /// </summary>
        public static void TestByteStreamSplitEncoding()
        {
            const int numRows = 10230;

            var ids    = Enumerable.Range(0, numRows).ToArray();
            var values = ids.Select(i => i / 3.14f).ToArray();

            using var buffer = new ResizableBuffer();

            using (var output = new BufferOutputStream(buffer))
            {
                var columns = new Column[]
                {
                    new Column <int>("id"),
                    new Column <float>("value")
                };

                // ByteStreamSplit requires dictionary encoding to be disabled for
                // the target column.
                var p = new WriterPropertiesBuilder()
                        .Compression(Compression.Lz4)
                        .DisableDictionary("value")
                        .Encoding("value", Encoding.ByteStreamSplit)
                        .Build();

                using var fileWriter  = new ParquetFileWriter(output, columns, p);
                using var groupWriter = fileWriter.AppendRowGroup();

                using var idWriter = groupWriter.NextColumn().LogicalWriter <int>();
                idWriter.WriteBatch(ids);

                using var valueWriter = groupWriter.NextColumn().LogicalWriter <float>();
                valueWriter.WriteBatch(values);

                fileWriter.Close();
            }

            using var input       = new BufferReader(buffer);
            using var fileReader  = new ParquetFileReader(input);
            using var groupReader = fileReader.RowGroup(0);

            using var metadataId    = groupReader.MetaData.GetColumnChunkMetaData(0);
            using var metadataValue = groupReader.MetaData.GetColumnChunkMetaData(1);

            // The id column keeps the default dictionary encodings; the value
            // column must report ByteStreamSplit.
            Assert.AreEqual(new[] { Encoding.PlainDictionary, Encoding.Plain, Encoding.Rle }, metadataId.Encodings);
            Assert.AreEqual(new[] { Encoding.ByteStreamSplit, Encoding.Rle }, metadataValue.Encodings);

            using var idReader    = groupReader.Column(0).LogicalReader <int>();
            using var valueReader = groupReader.Column(1).LogicalReader <float>();

            Assert.AreEqual(ids, idReader.ReadAll(numRows));
            Assert.AreEqual(values, valueReader.ReadAll(numRows));
        }
Пример #20
0
        /// <summary>
        /// Benchmark: reads the single decimal column of the first row group of
        /// <c>Filename</c> using a ParquetSharp logical reader.
        /// </summary>
        /// <returns>The decimal values read from the file.</returns>
        public decimal[] ParquetSharp()
        {
            using var fileReader  = new ParquetFileReader(Filename);
            using var groupReader = fileReader.RowGroup(0);
            using var reader      = groupReader.Column(0).LogicalReader <decimal>();

            var results = reader.ReadAll(_values.Length);

            // Optionally validate against the in-memory reference data.
            if (Check.Enabled)
            {
                Check.ArraysAreEqual(_values, results);
            }

            return results;
        }
Пример #21
0
        /// <summary>
        /// Reads a file with nested structures (structs, nullable fields, arrays of
        /// structs) and verifies each flattened leaf column.
        /// </summary>
        public void CanReadNestedStructure()
        {
            var directory = Path.GetDirectoryName(System.Reflection.Assembly.GetExecutingAssembly().Location);
            var path      = Path.Combine(directory !, "TestFiles/nested.parquet");

            using var fileReader = new ParquetFileReader(path);
            // The row group and column readers are disposable native resources;
            // using declarations release them at method exit (they leaked before).
            using var rowGroupReader = fileReader.RowGroup(0);

            // first_level_long
            using var column0Reader = rowGroupReader.Column(0).LogicalReader <long?>();
            var column0Actual   = column0Reader.ReadAll(2);
            var column0Expected = new[] { 1, 2 };

            Assert.AreEqual(column0Expected, column0Actual);

            // first_level_nullable_string
            using var column1Reader = rowGroupReader.Column(1).LogicalReader <string?>();
            var column1Actual   = column1Reader.ReadAll(2);
            var column1Expected = new[] { null, "Not Null String" };

            Assert.AreEqual(column1Expected, column1Actual);

            // nullable_struct.nullable_struct_string
            using var column2Reader = rowGroupReader.Column(2).LogicalReader <string?>();
            var column2Actual   = column2Reader.ReadAll(2);
            var column2Expected = new[] { "Nullable Struct String", null };

            Assert.AreEqual(column2Expected, column2Actual);

            // struct.struct_string
            using var column3Reader = rowGroupReader.Column(3).LogicalReader <string>();
            var column3Actual   = column3Reader.ReadAll(2);
            var column3Expected = new[] { "First Struct String", "Second Struct String" };

            Assert.AreEqual(column3Expected, column3Actual);

            // struct_array.array_in_struct_array
            using var column4Reader = rowGroupReader.Column(4).LogicalReader <long?[]?[]>();
            var column4Actual   = column4Reader.ReadAll(2);
            var column4Expected = new[] { new[] { new[] { 111, 112, 113 }, new[] { 121, 122, 123 } }, new[] { new[] { 211, 212, 213 } } };

            Assert.AreEqual(column4Expected, column4Actual);

            // struct_array.string_in_struct_array
            using var column5Reader = rowGroupReader.Column(5).LogicalReader <string[]>();
            var column5Actual   = column5Reader.ReadAll(2);
            var column5Expected = new[] { new[] { "First String", "Second String" }, new[] { "Third String" } };

            Assert.AreEqual(column5Expected, column5Actual);
        }
Example #22
0
        public static void TestReadNoTypeFactory()
        {
            // Without a LogicalReadConverterFactory, asking for a custom user type from a
            // float column must fail with an InvalidCastException.
            using var buffer      = WriteTestValues(Values);
            using var input       = new BufferReader(buffer);
            using var fileReader  = new ParquetFileReader(input);
            using var groupReader = fileReader.RowGroup(0);

            var exception = Assert.Throws <InvalidCastException>(CreateCustomTypeReader);

            StringAssert.StartsWith("Unable to cast object of type 'ParquetSharp.LogicalColumnReader`3[System.Single,System.Single,System.Single]", exception?.Message);

            void CreateCustomTypeReader()
            {
                using var reader = groupReader.Column(0).LogicalReader <VolumeInDollars>();
            }
        }
Example #23
0
        public static void TestByteBufferOptimisation()
        {
            // Round-trip a large batch of strings while a background task hammers the GC.
            // Strings-to-byte-arrays memory pooling is done by the ByteBuffer class; if
            // something is fishy there (e.g. bad memory ownership wrt the GC), we expect
            // to see consequences here if we write enough strings. It's not bullet proof,
            // but it has found a few issues.
            const int numStrings = 100_000;

            var strings = Enumerable.Range(0, numStrings).Select(i => i.ToString()).ToArray();

            // Fix: dispose the CancellationTokenSource (the original leaked it).
            using var cancel = new CancellationTokenSource();
            var task = Task.Run(() =>
            {
                // Keep forcing collections until the main body is done.
                while (!cancel.IsCancellationRequested)
                {
                    GC.Collect();
                    GC.WaitForPendingFinalizers();
                    Thread.Sleep(1);
                }
            });

            using (var buffer = new ResizableBuffer())
            {
                using (var outStream = new BufferOutputStream(buffer))
                {
                    using var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column <string>("Name") });

                    using (var groupWriter = fileWriter.AppendRowGroup())
                    {
                        using var columnWriter = groupWriter.NextColumn().LogicalWriter <string>();

                        columnWriter.WriteBatch(strings);
                    }

                    fileWriter.Close();
                }

                using var inStream     = new BufferReader(buffer);
                using var fileReader   = new ParquetFileReader(inStream);
                using var groupReader  = fileReader.RowGroup(0);
                using var columnReader = groupReader.Column(0).LogicalReader <string>();

                Assert.AreEqual(strings, columnReader.ReadAll(numStrings));
            }

            // Stop the GC-thrashing task before returning.
            cancel.Cancel();
            task.Wait();
        }
        public static void TestReadingDuplicateStrings([Values(true, false)] bool enableDictionary)
        {
            // A date column plus a string column whose values repeat heavily
            // (only 100 distinct strings across 10,000 rows).
            var columns = new Column[]
            {
                new Column <DateTime>("dateTime"),
                new Column <string>("value")
            };

            const int numRows = 10_000;
            var rand         = new Random(1);
            var dateValues   = Enumerable.Range(0, numRows).Select(i => new DateTime(2020, 01, 01).AddDays(i)).ToArray();
            var stringValues = Enumerable.Range(0, numRows).Select(i => (rand.Next(0, 100) * 1000).ToString()).ToArray();

            using var buffer = new ResizableBuffer();

            // Write a file that contains a lot of duplicate strings.
            using (var output = new BufferOutputStream(buffer))
            {
                using var fileWriter  = new ParquetFileWriter(output, columns, CreateWriterProperties(enableDictionary));
                using var groupWriter = fileWriter.AppendRowGroup();

                using var dateWriter = groupWriter.NextColumn().LogicalWriter <DateTime>();
                dateWriter.WriteBatch(dateValues);

                using var valueWriter = groupWriter.NextColumn().LogicalWriter <string>();
                valueWriter.WriteBatch(stringValues);
            }

            using var input       = new BufferReader(buffer);
            using var fileReader  = new ParquetFileReader(input);
            using var groupReader = fileReader.RowGroup(0);

            using var dateReader = groupReader.Column(0).LogicalReader <DateTime>();
            var readDates = dateReader.ReadAll(numRows);

            using var valueReader = groupReader.Column(1).LogicalReader <string>();
            var readValues = valueReader.ReadAll(numRows);

            Assert.AreEqual(dateValues, readDates);
            Assert.AreEqual(stringValues, readValues);

            // With dictionary encoding, duplicate strings should be deduplicated on read
            // and point at the same memory instances; without it, each row gets its own.
            var distinctInstances = readValues.Distinct(new StringReferenceComparer()).Count();
            Assert.That(distinctInstances, enableDictionary ? Is.EqualTo(100) : Is.EqualTo(numRows));
        }
Example #25
0
        public static void TestBigArrayRoundtrip()
        {
            // Round-trip a few rows, each holding the same big float array, to try to
            // detect buffer-size related issues in the array writer/reader.
            const int arrayLength = 8196;
            const int numRows     = 4;

            var bigArray = Enumerable.Range(0, arrayLength).Select(i => (float) i).ToArray();
            var expected = Enumerable.Repeat(bigArray, numRows).ToArray();

            using var buffer = new ResizableBuffer();

            // Write out a single array-valued column.
            using (var outStream = new BufferOutputStream(buffer))
            {
                using var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column <float[]>("big_array_field") });

                using (var rowGroupWriter = fileWriter.AppendRowGroup())
                {
                    using var colWriter = rowGroupWriter.NextColumn().LogicalWriter <float[]>();
                    colWriter.WriteBatch(expected);
                }

                fileWriter.Close();
            }

            // Read it back and compare against what was written.
            using var inStream     = new BufferReader(buffer);
            using var fileReader   = new ParquetFileReader(inStream);
            using var rowGroup     = fileReader.RowGroup(0);
            using var columnReader = rowGroup.Column(0).LogicalReader <float[]>();

            Assert.AreEqual(expected, columnReader.ReadAll((int) rowGroup.MetaData.NumRows));
        }
Example #26
0
        public static void TestEncryptJustOneColumn()
        {
            // Footer stays in plaintext and only the "Value" column is encrypted,
            // so the unencrypted column must remain readable without any keys.
            using (var buffer = new ResizableBuffer())
            {
                // Write the file with single-column encryption properties.
                using (var output = new BufferOutputStream(buffer))
                {
                    using var encryptionProperties = CreateEncryptJustOneColumnProperties();
                    WriteParquetFile(output, encryptionProperties);
                }

                // With a matching key retriever the whole file decrypts; inspect the
                // per-column crypto metadata while we are at it.
                using (var input = new BufferReader(buffer))
                {
                    using var decryptionProperties = CreateDecryptWithKeyRetrieverProperties();
                    ReadParquetFile(decryptionProperties, input, groupMetadata =>
                    {
                        using var idColumnMetadata    = groupMetadata.GetColumnChunkMetaData(0);
                        using var valueColumnMetadata = groupMetadata.GetColumnChunkMetaData(1);
                        using var idCrypto            = idColumnMetadata.CryptoMetadata;
                        using var valueCrypto         = valueColumnMetadata.CryptoMetadata;

                        // Column 0 is unencrypted, hence no crypto metadata at all.
                        Assert.AreEqual(null, idCrypto);

                        Assert.AreEqual("Value", valueCrypto.ColumnPath.ToDotString());
                        Assert.AreEqual(false, valueCrypto.EncryptedWithFooterKey);
                        Assert.AreEqual("Key2", valueCrypto.KeyMetadata);
                    });
                }

                // Without any decrypt properties only the unencrypted column is accessible.
                using (var input = new BufferReader(buffer))
                {
                    using var fileReader  = new ParquetFileReader(input);
                    using var groupReader = fileReader.RowGroup(0);

                    var numRows = (int) groupReader.MetaData.NumRows;

                    using (var idReader = groupReader.Column(0).LogicalReader <int>())
                    {
                        Assert.AreEqual(Ids, idReader.ReadAll(numRows));
                    }
                }
            }
        }
Example #27
0
        // Reader tests.

        private static void TestRead <TCustom, TValue>(TCustom[] expected, TValue[] written)
        {
            // Write raw values, then read them back as the custom user type TCustom:
            // - the ReadConverterFactory supplies the value conversion,
            // - LogicalReaderOverride forces the element type on the column reader.
            using var buffer     = WriteTestValues(written);
            using var input      = new BufferReader(buffer);
            using var fileReader = new ParquetFileReader(input)
                  {
                      LogicalReadConverterFactory = new ReadConverterFactory()
                  };
            using var groupReader  = fileReader.RowGroup(0);
            using var columnReader = groupReader.Column(0).LogicalReaderOverride <TCustom>();

            var numRows = checked ((int)groupReader.MetaData.NumRows);
            var actual  = columnReader.ReadAll(numRows);

            Assert.AreEqual(expected, actual);
        }
Example #28
0
        public static void TestArrayEdgeCasesRoundtrip()
        {
            // Exercise the tricky nested-array shapes, equivalent to the Python values:
            /*
             * [None, [], [1.0, None, 2.0]]
             * []
             * None
             * [[]]
             */
            var expected = new double?[][][]
            {
                new double?[][] { null, new double?[] { }, new double?[] { 1.0, null, 2.0 } },
                new double?[][] { },
                null,
                new double?[][] { new double?[] { } }
            };

            using var buffer = new ResizableBuffer();

            // Write the four rows into a single nested-array column.
            using (var outStream = new BufferOutputStream(buffer))
            {
                using var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column <double?[][]>("a") });

                using (var rowGroupWriter = fileWriter.AppendRowGroup())
                {
                    using var colWriter = rowGroupWriter.NextColumn().LogicalWriter <double?[][]>();
                    colWriter.WriteBatch(expected);
                }

                fileWriter.Close();
            }

            // Read everything back and compare structurally.
            using var inStream     = new BufferReader(buffer);
            using var fileReader   = new ParquetFileReader(inStream);
            using var rowGroup     = fileReader.RowGroup(0);
            using var columnReader = rowGroup.Column(0).LogicalReader <double?[][]>();

            Assert.AreEqual(4, rowGroup.MetaData.NumRows);

            var readBack = columnReader.ReadAll(4);
            Assert.AreEqual(expected, readBack);
        }
Example #29
0
        private static void ReadParquetFile(FileDecryptionProperties fileDecryptionProperties, BufferReader input, Action <RowGroupMetaData> onGroupReader)
        {
            // Open the file with the supplied decryption settings and verify that both
            // columns round-trip against the Ids/Values reference arrays.
            using var readerProperties = CreateReaderProperties(fileDecryptionProperties);
            using var fileReader       = new ParquetFileReader(input, readerProperties);
            using var groupReader      = fileReader.RowGroup(0);

            var groupMetadata = groupReader.MetaData;
            var rowCount      = (int) groupMetadata.NumRows;

            // Let the caller inspect the row-group metadata (e.g. crypto metadata).
            onGroupReader?.Invoke(groupMetadata);

            using (var idReader = groupReader.Column(0).LogicalReader <int>())
            {
                Assert.AreEqual(Ids, idReader.ReadAll(rowCount));
            }

            using (var valueReader = groupReader.Column(1).LogicalReader <float>())
            {
                Assert.AreEqual(Values, valueReader.ReadAll(rowCount));
            }
        }
Example #30
0
        private void ReadParquetFile(ResizableBuffer buffer, MemoryPool pool)
        {
            // Read all three columns back and check the key-value metadata survived the
            // round trip. The read values themselves are discarded.
            // NOTE(review): 'pool' is not used in this overload — presumably kept for
            // symmetry with a writer counterpart; confirm before removing.
            using (var input = new BufferReader(buffer))
                using (var fileReader = new ParquetFileReader(input))
                {
                    Assert.AreEqual(_keyValueProperties, fileReader.FileMetaData.KeyValueMetadata);

                    using var rowGroupReader = fileReader.RowGroup(0);

                    var rowCount = checked ((int)rowGroupReader.MetaData.NumRows);

                    using var dateTimeReader = rowGroupReader.Column(0).LogicalReader <DateTime>();
                    using var objectIdReader = rowGroupReader.Column(1).LogicalReader <int>();
                    using var valueReader    = rowGroupReader.Column(2).LogicalReader <float>();

                    dateTimeReader.ReadAll(rowCount);
                    objectIdReader.ReadAll(rowCount);
                    valueReader.ReadAll(rowCount);

                    fileReader.Close();
                }
        }