        public static void TestArrayEdgeCasesRoundtrip()
        {
            /*
             * [None, [], [1.0, None, 2.0]]
             * []
             * None
             * [[]]
             */
            var expected = new double?[][][]
            {
                new double?[][] { null, new double?[] { }, new double?[] { 1.0, null, 2.0 } },
                new double?[][] { },
                null,
                new double?[][] { new double?[] { } }
            };

            using var buffer = new ResizableBuffer();

            using (var outStream = new BufferOutputStream(buffer))
            {
                using var fileWriter     = new ParquetFileWriter(outStream, new Column[] { new Column<double?[][]>("a") });
                using var rowGroupWriter = fileWriter.AppendRowGroup();
                using var colWriter      = rowGroupWriter.NextColumn().LogicalWriter<double?[][]>();

                colWriter.WriteBatch(expected);

                fileWriter.Close();
            }

            using var inStream     = new BufferReader(buffer);
            using var fileReader   = new ParquetFileReader(inStream);
            using var rowGroup     = fileReader.RowGroup(0);
            using var columnReader = rowGroup.Column(0).LogicalReader<double?[][]>();

            Assert.AreEqual(4, rowGroup.MetaData.NumRows);
            var allData = columnReader.ReadAll(4);

            Assert.AreEqual(expected, allData);
        }
        public static void TestDisposeExceptionSafety_ColumnWriter()
        {
            var exception = Assert.Throws<Exception>(() =>
            {
                using var buffer      = new ResizableBuffer();
                using var outStream   = new BufferOutputStream(buffer);
                using var fileWriter  = new ParquetFileWriter(outStream, new Column[] { new Column<int>("Index"), new Column<float>("Value") });
                using var groupWriter = fileWriter.AppendRowGroup();

                using (var writer = groupWriter.NextColumn().LogicalWriter<int>())
                {
                    writer.WriteBatch(new[] { 1, 2, 3, 4, 5, 6 });
                }

                using (var writer = groupWriter.NextColumn().LogicalWriter<float>())
                {
                    throw new Exception("this is the expected message");
                }
            });

            Assert.That(exception?.Message, Contains.Substring("this is the expected message"));
        }
        public static unsafe void TestParquetReadFromBuffer()
        {
            var expected = Enumerable.Range(0, 100).ToArray();

            // Write out a single column
            byte[] parquetFileBytes;
            using (var outBuffer = new ResizableBuffer())
            {
                using (var outStream = new BufferOutputStream(outBuffer))
                {
                    using var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column<int>("int_field") });

                    using (var rowGroupWriter = fileWriter.AppendRowGroup())
                    {
                        using var colWriter = rowGroupWriter.NextColumn().LogicalWriter<int>();
                        colWriter.WriteBatch(expected);
                    }

                    fileWriter.Close();
                }

                parquetFileBytes = outBuffer.ToArray();
            }

            // Read it back
            fixed (byte* fixedBytes = parquetFileBytes)
            {
                using var buffer       = new IO.Buffer(new IntPtr(fixedBytes), parquetFileBytes.Length);
                using var inStream     = new BufferReader(buffer);
                using var fileReader   = new ParquetFileReader(inStream);
                using var rowGroup     = fileReader.RowGroup(0);
                using var columnReader = rowGroup.Column(0).LogicalReader<int>();

                var allData = columnReader.ReadAll((int)rowGroup.MetaData.NumRows);

                Assert.AreEqual(expected, allData);
            }
        }
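        // An alternative sketch that avoids unsafe pinning: assuming ParquetSharp also
        // exposes ManagedRandomAccessFile (a RandomAccessFile backed by a .NET Stream),
        // the same bytes could be read back without a fixed block.
        private static int[] ReadFromManagedStream(byte[] parquetFileBytes)
        {
            using var memory     = new System.IO.MemoryStream(parquetFileBytes);
            using var input      = new ManagedRandomAccessFile(memory);
            using var fileReader = new ParquetFileReader(input);
            using var rowGroup   = fileReader.RowGroup(0);
            using var reader     = rowGroup.Column(0).LogicalReader<int>();

            return reader.ReadAll((int)rowGroup.MetaData.NumRows);
        }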
        public static void TestRoundTrip(
            // 2^i, 7^j, 11^k are mutually co-prime for i,j,k>0
            [Values(2, 8, 32, 128)] int rowsPerBatch,
            [Values(7, 49, 343, 2401)] int writeBufferLength,
            [Values(11, 121, 1331)] int readBufferLength,
            [Values(true, false)] bool useDictionaryEncoding
            )
        {
            var expectedColumns = CreateExpectedColumns();
            var schemaColumns   = expectedColumns
                                  .Select(c => new Column(c.Values.GetType().GetElementType() ?? throw new InvalidOperationException(), c.Name, c.LogicalTypeOverride))
                                  .ToArray();

            using var buffer = new ResizableBuffer();

            // Write our expected columns to the parquet in-memory file.
            using (var outStream = new BufferOutputStream(buffer))
            {
                using var writerProperties = CreateWriterProperties(expectedColumns, useDictionaryEncoding);
                using var fileWriter       = new ParquetFileWriter(outStream, schemaColumns, writerProperties);
                using var rowGroupWriter   = fileWriter.AppendRowGroup();

                foreach (var column in expectedColumns)
                {
                    Console.WriteLine("Writing '{0}' ({1})", column.Name, column.Values.GetType().GetElementType());

                    using var columnWriter = rowGroupWriter.NextColumn().LogicalWriter(writeBufferLength);
                    columnWriter.Apply(new LogicalValueSetter(column.Values, rowsPerBatch));
                }

                fileWriter.Close();
            }

            Console.WriteLine();

            // Read back the columns and make sure they match.
            AssertReadRoundtrip(rowsPerBatch, readBufferLength, buffer, expectedColumns);
        }
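        // A plausible sketch of the CreateWriterProperties helper called above. Its real
        // body is not shown in this example, and the ExpectedColumn type name is an
        // assumption; the point is that dictionary encoding is toggled per test run.
        private static WriterProperties CreateWriterProperties(ExpectedColumn[] expectedColumns, bool useDictionaryEncoding)
        {
            using var builder = new WriterPropertiesBuilder();

            if (!useDictionaryEncoding)
            {
                // Disable dictionary encoding for every column of the file.
                builder.DisableDictionary();
            }

            return builder.Build();
        }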
        private static ResizableBuffer WriteTestValues<TValue>(TValue[] written)
        {
            var buffer = new ResizableBuffer();

            try
            {
                using var output       = new BufferOutputStream(buffer);
                using var fileWriter   = new ParquetFileWriter(output, new Column[] { new Column<TValue>("values") });
                using var groupWriter  = fileWriter.AppendRowGroup();
                using var columnWriter = groupWriter.NextColumn().LogicalWriter<TValue>();

                columnWriter.WriteBatch(written);
                fileWriter.Close();

                return buffer;
            }
            catch
            {
                buffer.Dispose();
                throw;
            }
        }
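        // Usage note (hypothetical call): the try/catch above transfers ownership of the
        // buffer to the caller on success, so the caller must dispose it in turn, e.g.:
        //
        //     using var buffer = WriteTestValues(new[] { 1, 2, 3 });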
        private static void TestWriteNoWriterOverride<TValue, TCustom>(TValue[] expected, TCustom[] written)
        {
            using var buffer = new ResizableBuffer();

            // Write float values using a custom user-type:
            // - Provide a type factory such that Column<VolumeInDollars> can be converted to the right schema node.
            // - Provide a converter factory such that VolumeInDollars values can be written as floats.
            // - Do not explicitly override the expected type when accessing the LogicalColumnWriter.

            using (var output = new BufferOutputStream(buffer))
            {
                using var fileWriter = new ParquetFileWriter(output, new Column[] { new Column<TCustom>("values") }, new WriteTypeFactory())
                {
                    LogicalWriteConverterFactory = new WriteConverterFactory()
                };
                using var groupWriter  = fileWriter.AppendRowGroup();
                using var columnWriter = groupWriter.NextColumn().LogicalWriter<TCustom>();

                columnWriter.WriteBatch(written);
                fileWriter.Close();
            }

            CheckWrittenValues(buffer, expected);
        }
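        // A minimal sketch of the kind of custom user-type the factories above map onto
        // a float physical column. The VolumeInDollars name comes from the comment in
        // TestWriteNoWriterOverride; this particular definition is an assumption.
        internal readonly struct VolumeInDollars
        {
            public VolumeInDollars(float value) => Value = value;

            public readonly float Value;
        }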
        public static void TestReaderWriteTypes(
            // 2^i, 7^j, 11^k are mutually co-prime for i,j,k>0
            [Values(2, 8, 32, 128)] int rowsPerBatch,
            [Values(7, 49, 343, 2401)] int writeBufferLength,
            [Values(11, 121, 1331)] int readBufferLength
            )
        {
            var expectedColumns = CreateExpectedColumns();
            var schemaColumns   = expectedColumns.Select(c => new Column(c.Values.GetType().GetElementType(), c.Name, c.LogicalTypeOverride)).ToArray();

            using var buffer = new ResizableBuffer();

            // Write our expected columns to the parquet in-memory file.
            using (var outStream = new BufferOutputStream(buffer))
            {
                using var fileWriter = new ParquetFileWriter(outStream, schemaColumns);

                using (var rowGroupWriter = fileWriter.AppendRowGroup())
                {
                    foreach (var column in expectedColumns)
                    {
                        Console.WriteLine("Writing '{0}' ({1})", column.Name, column.Values.GetType().GetElementType());

                        using var columnWriter = rowGroupWriter.NextColumn().LogicalWriter(writeBufferLength);
                        columnWriter.Apply(new LogicalValueSetter(column.Values, rowsPerBatch));
                    }
                }

                fileWriter.Close();
            }

            Console.WriteLine();

            // Read back the columns and make sure they match.
            using var inStream       = new BufferReader(buffer);
            using var fileReader     = new ParquetFileReader(inStream);
            using var fileMetaData   = fileReader.FileMetaData;
            using var rowGroupReader = fileReader.RowGroup(0);

            var rowGroupMetaData = rowGroupReader.MetaData;
            var numRows          = rowGroupMetaData.NumRows;

            for (int c = 0; c != fileMetaData.NumColumns; ++c)
            {
                var expected = expectedColumns[c];

                // Test properties, and read methods.
                using (var columnReader = rowGroupReader.Column(c).LogicalReader(readBufferLength))
                {
                    var descr         = columnReader.ColumnDescriptor;
                    var chunkMetaData = rowGroupMetaData.GetColumnChunkMetaData(c);
                    var statistics    = chunkMetaData.Statistics;

                    Console.WriteLine("Reading '{0}'", expected.Name);

                    Assert.AreEqual(expected.PhysicalType, descr.PhysicalType);
                    Assert.AreEqual(expected.LogicalType, descr.LogicalType);
                    Assert.AreEqual(expected.Values, columnReader.Apply(new LogicalValueGetter(checked((int)numRows), rowsPerBatch)));
                    Assert.AreEqual(expected.Length, descr.TypeLength);
                    Assert.AreEqual((expected.LogicalType as DecimalLogicalType)?.Precision ?? -1, descr.TypePrecision);
                    Assert.AreEqual((expected.LogicalType as DecimalLogicalType)?.Scale ?? -1, descr.TypeScale);
                    Assert.AreEqual(expected.HasStatistics, chunkMetaData.IsStatsSet);

                    if (expected.HasStatistics)
                    {
                        Assert.AreEqual(expected.HasMinMax, statistics.HasMinMax);
                        //Assert.AreEqual(expected.NullCount, statistics.NullCount);
                        //Assert.AreEqual(expected.NumValues, statistics.NumValues);
                        Assert.AreEqual(expected.PhysicalType, statistics.PhysicalType);

                        // BUG Don't check for decimal until https://issues.apache.org/jira/browse/ARROW-6149 is fixed.
                        var buggy = expected.LogicalType is DecimalLogicalType;

                        if (expected.HasMinMax && !buggy)
                        {
                            Assert.AreEqual(expected.Min, expected.Converter(statistics.MinUntyped));
                            Assert.AreEqual(expected.Max, expected.Converter(statistics.MaxUntyped));
                        }
                    }
                    else
                    {
                        Assert.IsNull(statistics);
                    }
                }

                // Test IEnumerable interface
                using (var columnReader = rowGroupReader.Column(c).LogicalReader(readBufferLength))
                {
                    Assert.AreEqual(expected.Values, columnReader.Apply(new LogicalColumnReaderToArray()));
                }
            }
        }
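        // A minimal sketch of the visitor pattern used by Apply() above, assuming
        // ParquetSharp's ILogicalColumnReaderVisitor<TReturn> interface (which
        // LogicalValueGetter and LogicalColumnReaderToArray implement): Apply()
        // dispatches with the column's concrete element type, so the visitor body is
        // fully typed without reflection at the call site.
        private sealed class RowCountingVisitor : ILogicalColumnReaderVisitor<long>
        {
            public long OnLogicalColumnReader<TValue>(LogicalColumnReader<TValue> columnReader)
            {
                // Enumerate the column's logical values and count them.
                long count = 0;
                foreach (var unused in columnReader) ++count;
                return count;
            }
        }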
        public static void TestReaderWriteTypes()
        {
            var schema           = CreateSchema();
            var writerProperties = CreateWriterProperties();
            var keyValueMetadata = new Dictionary<string, string> {
                { "case", "Test" }, { "Awesome", "true" }
            };
            var expectedColumns = CreateExpectedColumns();

            using (var buffer = new ResizableBuffer())
            {
                // Write our expected columns to the parquet in-memory file.
                using (var outStream = new BufferOutputStream(buffer))
                    using (var fileWriter = new ParquetFileWriter(outStream, schema, writerProperties, keyValueMetadata))
                        using (var rowGroupWriter = fileWriter.AppendRowGroup())
                        {
                            foreach (var column in expectedColumns)
                            {
                                Console.WriteLine("Writing '{0}'", column.Name);

                                using (var columnWriter = rowGroupWriter.NextColumn())
                                {
                                    columnWriter.Apply(new ValueSetter(column.Values));
                                }
                            }
                        }

                // Read back the columns and make sure they match.
                using (var inStream = new BufferReader(buffer))
                    using (var fileReader = new ParquetFileReader(inStream))
                    {
                        var fileMetaData = fileReader.FileMetaData;

                        Assert.AreEqual("parquet-cpp version 1.5.1-SNAPSHOT", fileMetaData.CreatedBy);
                        Assert.AreEqual(new Dictionary<string, string> {
                            { "case", "Test" }, { "Awesome", "true" }
                        }, fileMetaData.KeyValueMetadata);
                        Assert.AreEqual(expectedColumns.Length, fileMetaData.NumColumns);
                        Assert.AreEqual(NumRows, fileMetaData.NumRows);
                        Assert.AreEqual(1, fileMetaData.NumRowGroups);
                        Assert.AreEqual(1 + expectedColumns.Length, fileMetaData.NumSchemaElements);
                        Assert.AreEqual(ParquetVersion.PARQUET_1_0, fileMetaData.Version);
                        Assert.AreEqual("parquet-cpp version 1.5.1", fileMetaData.WriterVersion.ToString());

                        using (var rowGroupReader = fileReader.RowGroup(0))
                        {
                            var rowGroupMetaData = rowGroupReader.MetaData;

                            for (int c = 0; c != fileMetaData.NumColumns; ++c)
                            {
                                using (var columnReader = rowGroupReader.Column(c))
                                {
                                    var expected = expectedColumns[c];

                                    Console.WriteLine("Reading '{0}'", expected.Name);

                                    var descr         = columnReader.ColumnDescriptor;
                                    var chunkMetaData = rowGroupMetaData.GetColumnChunkMetaData(0);

                                    Assert.AreEqual(expected.MaxDefinitionlevel, descr.MaxDefinitionLevel);
                                    Assert.AreEqual(expected.MaxRepetitionLevel, descr.MaxRepetitionLevel);
                                    Assert.AreEqual(expected.PhysicalType, descr.PhysicalType);
                                    Assert.AreEqual(expected.LogicalType, descr.LogicalType);
                                    Assert.AreEqual(expected.ColumnOrder, descr.ColumnOrder);
                                    Assert.AreEqual(expected.SortOrder, descr.SortOrder);
                                    Assert.AreEqual(expected.Name, descr.Name);
                                    Assert.AreEqual(expected.TypeLength, descr.TypeLength);
                                    Assert.AreEqual(expected.TypePrecision, descr.TypePrecision);
                                    Assert.AreEqual(expected.TypeScale, descr.TypeScale);

                                    Assert.AreEqual(expected.Encodings, chunkMetaData.Encodings);
                                    Assert.AreEqual(expected.Compression, chunkMetaData.Compression);
                                    Assert.AreEqual(expected.Values, columnReader.Apply(new PhysicalValueGetter(chunkMetaData.NumValues)).values);
                                }
                            }
                        }
                    }
            }
        }
        public static void TestBigFileBufferedRowGroup()
        {
            // Test a large amount of rows with a buffered row group to uncover any particular issue.
            const int numBatches = 64;
            const int batchSize  = 8192;

            using var buffer = new ResizableBuffer();

            using (var output = new BufferOutputStream(buffer))
            {
                var columns = new Column[]
                {
                    new Column <int>("int"),
                    new Column <double>("double"),
                    new Column <string>("string"),
                    new Column <bool>("bool")
                };

                using var builder          = new WriterPropertiesBuilder();
                using var writerProperties = builder.Compression(Compression.Snappy).DisableDictionary("double").Build();
                using var fileWriter       = new ParquetFileWriter(output, columns, writerProperties);
                using var rowGroupWriter   = fileWriter.AppendBufferedRowGroup();

                using var col0 = rowGroupWriter.Column(0).LogicalWriter<int>();
                using var col1 = rowGroupWriter.Column(1).LogicalWriter<double>();
                using var col2 = rowGroupWriter.Column(2).LogicalWriter<string>();
                using var col3 = rowGroupWriter.Column(3).LogicalWriter<bool>();

                for (var batchIndex = 0; batchIndex < numBatches; ++batchIndex)
                {
                    var startIndex = batchSize * batchIndex;

                    col0.WriteBatch(Enumerable.Range(startIndex, batchSize).ToArray());
                    col1.WriteBatch(Enumerable.Range(startIndex, batchSize).Select(i => (double)i).ToArray());
                    col2.WriteBatch(Enumerable.Range(startIndex, batchSize).Select(i => i.ToString()).ToArray());
                    col3.WriteBatch(Enumerable.Range(startIndex, batchSize).Select(i => i % 2 == 0).ToArray());
                }

                fileWriter.Close();
            }

            using (var input = new BufferReader(buffer))
            {
                using var fileReader     = new ParquetFileReader(input);
                using var rowGroupReader = fileReader.RowGroup(0);

                using var col0 = rowGroupReader.Column(0).LogicalReader<int>();
                using var col1 = rowGroupReader.Column(1).LogicalReader<double>();
                using var col2 = rowGroupReader.Column(2).LogicalReader<string>();
                using var col3 = rowGroupReader.Column(3).LogicalReader<bool>();

                for (var batchIndex = 0; batchIndex < numBatches; ++batchIndex)
                {
                    var startIndex = batchSize * batchIndex;

                    Assert.AreEqual(Enumerable.Range(startIndex, batchSize).ToArray(), col0.ReadAll(batchSize));
                    Assert.AreEqual(Enumerable.Range(startIndex, batchSize).Select(i => (double)i).ToArray(), col1.ReadAll(batchSize));
                    Assert.AreEqual(Enumerable.Range(startIndex, batchSize).Select(i => i.ToString()).ToArray(), col2.ReadAll(batchSize));
                    Assert.AreEqual(Enumerable.Range(startIndex, batchSize).Select(i => i % 2 == 0).ToArray(), col3.ReadAll(batchSize));
                }

                fileReader.Close();
            }
        }
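        // Design note on the test above: unlike AppendRowGroup(), where NextColumn()
        // hands out one writer at a time in strict schema order, AppendBufferedRowGroup()
        // exposes all column writers at once via Column(i), which is what permits the
        // interleaved, batch-by-batch writes shown here.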
        public static void TestProperties()
        {
            // Test the various properties exposed by ParquetFileWriter.

            using var writerPropertiesBuilder = new WriterPropertiesBuilder();
            using var writerProperties        = writerPropertiesBuilder
                                                .Compression(Compression.Zstd)
                                                .DisableDictionary()
                                                .CreatedBy("Some crazy unit test")
                                                .Build();

            var columns = new Column[]
            {
                new Column <int>("Index"),
                new Column <float>("Value")
            };

            var kvm = (IReadOnlyDictionary<string, string>) new Dictionary<string, string>
            {
                { "some key", "some value" },
                { "α ∧ ¬β", "2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm" }
            };

            using var buffer     = new ResizableBuffer();
            using var outStream  = new BufferOutputStream(buffer);
            using var fileWriter = new ParquetFileWriter(outStream, columns, writerProperties, kvm);

            Assert.AreEqual(2, fileWriter.NumColumns);
            Assert.AreEqual(0, fileWriter.NumRows);
            Assert.AreEqual(0, fileWriter.NumRowGroups);
            Assert.IsNull(fileWriter.FileMetaData);
            Assert.AreEqual(Column.CreateSchemaNode(columns), fileWriter.Schema.GroupNode);
            Assert.AreEqual(Column.CreateSchemaNode(columns), fileWriter.Schema.SchemaRoot);
            Assert.AreEqual(columns[0].Name, fileWriter.ColumnDescriptor(0).Name);
            Assert.AreEqual(columns[1].Name, fileWriter.ColumnDescriptor(1).Name);
            Assert.AreEqual(kvm, fileWriter.KeyValueMetadata);

            Assert.AreEqual(Compression.Zstd, fileWriter.WriterProperties.Compression(new ColumnPath("")));
            Assert.AreEqual(false, fileWriter.WriterProperties.DictionaryEnabled(new ColumnPath("")));
            Assert.AreEqual("Some crazy unit test", fileWriter.WriterProperties.CreatedBy);

            using (var groupWriter = fileWriter.AppendRowGroup())
            {
                Assert.AreEqual(0, fileWriter.NumRows);
                Assert.AreEqual(1, fileWriter.NumRowGroups);
                Assert.IsNull(fileWriter.FileMetaData);

                using (var writer = groupWriter.NextColumn().LogicalWriter<int>())
                {
                    writer.WriteBatch(new[] { 1, 2, 3, 4, 5, 6 });
                }

                Assert.AreEqual(0, fileWriter.NumRows);
                Assert.AreEqual(1, fileWriter.NumRowGroups);
                Assert.IsNull(fileWriter.FileMetaData);

                using (var writer = groupWriter.NextColumn().LogicalWriter<float>())
                {
                    writer.WriteBatch(new[] { 1f, 2f, 3f, 4f, 5f, 6f });
                }

                Assert.AreEqual(0, fileWriter.NumRows);
                Assert.AreEqual(1, fileWriter.NumRowGroups);
                Assert.IsNull(fileWriter.FileMetaData);
            }

            Assert.AreEqual(0, fileWriter.NumRows);
            Assert.AreEqual(1, fileWriter.NumRowGroups);
            Assert.IsNull(fileWriter.FileMetaData);

            fileWriter.Close();

            //Assert.AreEqual(0, fileWriter.NumRows); // 2021-04-08: calling this results in a segfault when the writer has been closed
            //Assert.AreEqual(1, fileWriter.NumRowGroups); // 2021-04-08: calling this results in a segfault when the writer has been closed
            Assert.IsNotNull(fileWriter.FileMetaData);
            Assert.AreEqual(2, fileWriter.FileMetaData?.NumColumns);
            Assert.AreEqual(6, fileWriter.FileMetaData?.NumRows);
            Assert.AreEqual(1, fileWriter.FileMetaData?.NumRowGroups);
            Assert.AreEqual(kvm, fileWriter.FileMetaData?.KeyValueMetadata);
        }
        public UtilStreamBuffer(int initialsize)
        {
            // Allocate the backing byte array and the stream wrappers around it.
            buf          = new byte[initialsize];
            inputStream  = new BufferInputStream();
            outputStream = new BufferOutputStream();
        }