public static void TestPropertiesBuilder()
        {
            // Configure every scalar writer setting, build, and verify that each
            // value round-trips through the resulting WriterProperties.
            var builder = new WriterPropertiesBuilder();

            builder
                .Compression(Compression.Snappy)
                .CompressionLevel(3)
                .CreatedBy("Meeeee!!!")
                .DataPagesize(123)
                .DictionaryPagesizeLimit(456)
                .Encoding(Encoding.DeltaByteArray)
                .MaxRowGroupLength(789)
                .Version(ParquetVersion.PARQUET_1_0)
                .WriteBatchSize(666);

            var properties = builder.Build();

            // Per-column getters accept any path when the setting was applied globally.
            var anyPath = new ColumnPath("anypath");

            Assert.AreEqual("Meeeee!!!", properties.CreatedBy);
            Assert.AreEqual(Compression.Snappy, properties.Compression(anyPath));
            Assert.AreEqual(3, properties.CompressionLevel(anyPath));
            Assert.AreEqual(123, properties.DataPageSize);
            // The dictionary encodings appear to keep their PlainDictionary
            // defaults regardless of the Encoding() call above.
            Assert.AreEqual(Encoding.PlainDictionary, properties.DictionaryIndexEncoding);
            Assert.AreEqual(Encoding.PlainDictionary, properties.DictionaryPageEncoding);
            Assert.AreEqual(456, properties.DictionaryPagesizeLimit);
            Assert.AreEqual(789, properties.MaxRowGroupLength);
            Assert.AreEqual(ParquetVersion.PARQUET_1_0, properties.Version);
            Assert.AreEqual(666, properties.WriteBatchSize);
        }
        private static WriterProperties CreateWriterProperties()
        {
            // Snappy-compressed writer properties.
            // Dispose the builder once the properties are built: the other
            // CreateWriterProperties overloads in this file `using`-dispose it,
            // and the builder wraps native resources.
            using var builder = new WriterPropertiesBuilder();

            builder.Compression(Compression.Snappy);

            return builder.Build();
        }
        private static WriterProperties CreateWriterProperties(FileEncryptionProperties fileEncryptionProperties)
        {
            // Lz4-compressed writer properties with file encryption enabled.
            using var builder = new WriterPropertiesBuilder();

            builder.Compression(Compression.Lz4);
            builder.Encryption(fileEncryptionProperties);

            return builder.Build();
        }
        public static void TestPropertiesBuilder()
        {
            // Apply every scalar writer setting in one fluent chain and verify
            // that each value is reflected on the built WriterProperties.
            var p = new WriterPropertiesBuilder()
                    .Compression(Compression.Snappy)
                    .CreatedBy("Meeeee!!!")
                    .DataPagesize(123)
                    .DictionaryPagesizeLimit(456)
                    .Encoding(Encoding.DeltaByteArray)
                    .MaxRowGroupLength(789)
                    .Version(ParquetVersion.PARQUET_1_0)
                    .WriteBatchSize(666)
                    .Build();

            Assert.AreEqual("Meeeee!!!", p.CreatedBy);
            Assert.AreEqual(123, p.DataPageSize);
            // The dictionary encodings appear to keep their PlainDictionary
            // defaults regardless of the Encoding() call above.
            Assert.AreEqual(Encoding.PlainDictionary, p.DictionaryIndexEncoding);
            Assert.AreEqual(Encoding.PlainDictionary, p.DictionaryPageEncoding);
            Assert.AreEqual(456, p.DictionaryPagesizeLimit);
            Assert.AreEqual(789, p.MaxRowGroupLength);
            Assert.AreEqual(ParquetVersion.PARQUET_1_0, p.Version);
            Assert.AreEqual(666, p.WriteBatchSize);

            /*
             * Builder methods not exercised above:
             *
             * public WriterPropertiesBuilder DisableDictionary()
             * public WriterPropertiesBuilder DisableDictionary(string path)
             * public WriterPropertiesBuilder EnableDictionary()
             * public WriterPropertiesBuilder EnableDictionary(string path)
             *
             * // Statistics enable/disable
             *
             * public WriterPropertiesBuilder DisableStatistics()
             * public WriterPropertiesBuilder DisableStatistics(string path)
             * public WriterPropertiesBuilder EnableStatistics()
             * public WriterPropertiesBuilder EnableStatistics(string path)
             *
             * // Other properties
             *
             * public WriterPropertiesBuilder Compression(Compression codec)
             * public WriterPropertiesBuilder Compression(string path, Compression codec)
             * public WriterPropertiesBuilder CreatedBy(string createdBy)
             * public WriterPropertiesBuilder DataPagesize(long pageSize)
             * public WriterPropertiesBuilder DictionaryPagesizeLimit(long dictionaryPagesizeLimit)
             * public WriterPropertiesBuilder Encoding(Encoding encoding)
             * public WriterPropertiesBuilder Encoding(string path, Encoding encoding)
             * public WriterPropertiesBuilder MaxRowGroupLength(long maxRowGroupLength)
             * public WriterPropertiesBuilder Version(ParquetVersion version)
             * public WriterPropertiesBuilder WriteBatchSize(long writeBatchSize)
             */
        }
        public static void TestByteStreamSplitEncoding()
        {
            // Write a float column with ByteStreamSplit encoding (dictionary
            // disabled for that column) and check both the encodings reported in
            // the column-chunk metadata and the round-tripped values.
            const int rowCount = 10230;

            var indices = Enumerable.Range(0, rowCount).ToArray();
            var floats  = indices.Select(i => i / 3.14f).ToArray();

            using var buffer = new ResizableBuffer();

            using (var output = new BufferOutputStream(buffer))
            {
                var schema = new Column[]
                {
                    new Column<int>("id"),
                    new Column<float>("value")
                };

                // ByteStreamSplit only takes effect once dictionary encoding is
                // disabled for the column.
                var writerProperties = new WriterPropertiesBuilder()
                                       .Compression(Compression.Lz4)
                                       .DisableDictionary("value")
                                       .Encoding("value", Encoding.ByteStreamSplit)
                                       .Build();

                using var fileWriter  = new ParquetFileWriter(output, schema, writerProperties);
                using var groupWriter = fileWriter.AppendRowGroup();

                using var idWriter = groupWriter.NextColumn().LogicalWriter<int>();
                idWriter.WriteBatch(indices);

                using var valueWriter = groupWriter.NextColumn().LogicalWriter<float>();
                valueWriter.WriteBatch(floats);

                fileWriter.Close();
            }

            using var input       = new BufferReader(buffer);
            using var fileReader  = new ParquetFileReader(input);
            using var groupReader = fileReader.RowGroup(0);

            using var idMetadata    = groupReader.MetaData.GetColumnChunkMetaData(0);
            using var valueMetadata = groupReader.MetaData.GetColumnChunkMetaData(1);

            // "id" uses the default dictionary path; "value" reports ByteStreamSplit.
            Assert.AreEqual(new[] { Encoding.PlainDictionary, Encoding.Plain, Encoding.Rle }, idMetadata.Encodings);
            Assert.AreEqual(new[] { Encoding.ByteStreamSplit, Encoding.Rle }, valueMetadata.Encodings);

            using var idReader    = groupReader.Column(0).LogicalReader<int>();
            using var valueReader = groupReader.Column(1).LogicalReader<float>();

            Assert.AreEqual(indices, idReader.ReadAll(rowCount));
            Assert.AreEqual(floats, valueReader.ReadAll(rowCount));
        }
        private static WriterProperties CreateWriterProperties(ExpectedColumn[] expectedColumns, bool useDictionaryEncoding)
        {
            // Lz4 compression; when dictionary encoding is not wanted, disable it
            // explicitly for every expected column.
            // Dispose the builder once the properties are built, consistent with
            // the other CreateWriterProperties overloads in this file.
            using var builder = new WriterPropertiesBuilder();

            builder.Compression(Compression.Lz4);

            if (!useDictionaryEncoding)
            {
                foreach (var column in expectedColumns)
                {
                    builder.DisableDictionary(column.Name);
                }
            }

            return builder.Build();
        }
 private static WriterProperties CreateWriterProperties(bool enableDictionary)
 {
     // Snappy compression; dictionary encoding for the "value" column is
     // switched off on demand.
     using var builder = new WriterPropertiesBuilder();

     builder.Compression(Compression.Snappy);

     if (!enableDictionary)
     {
         builder.DisableDictionary("value");
     }

     return builder.Build();
 }
        public static void TestBigFileBufferedRowGroup()
        {
            // Write many batches into a single buffered row group and read them
            // back, to uncover issues that only appear with larger data volumes.
            const int numBatches = 64;
            const int batchSize  = 8192;

            using var buffer = new ResizableBuffer();

            using (var output = new BufferOutputStream(buffer))
            {
                var schema = new Column[]
                {
                    new Column<int>("int"),
                    new Column<double>("double"),
                    new Column<string>("string"),
                    new Column<bool>("bool")
                };

                using var builder          = new WriterPropertiesBuilder();
                using var writerProperties = builder.Compression(Compression.Snappy).DisableDictionary("double").Build();
                using var fileWriter       = new ParquetFileWriter(output, schema, writerProperties);
                using var rowGroupWriter   = fileWriter.AppendBufferedRowGroup();

                using var intWriter    = rowGroupWriter.Column(0).LogicalWriter<int>();
                using var doubleWriter = rowGroupWriter.Column(1).LogicalWriter<double>();
                using var stringWriter = rowGroupWriter.Column(2).LogicalWriter<string>();
                using var boolWriter   = rowGroupWriter.Column(3).LogicalWriter<bool>();

                for (var batch = 0; batch < numBatches; ++batch)
                {
                    var offset = batchSize * batch;
                    // Deferred sequence; each projection below enumerates it afresh.
                    var range  = Enumerable.Range(offset, batchSize);

                    intWriter.WriteBatch(range.ToArray());
                    doubleWriter.WriteBatch(range.Select(i => (double) i).ToArray());
                    stringWriter.WriteBatch(range.Select(i => i.ToString()).ToArray());
                    boolWriter.WriteBatch(range.Select(i => i % 2 == 0).ToArray());
                }

                fileWriter.Close();
            }

            using (var input = new BufferReader(buffer))
            {
                using var fileReader     = new ParquetFileReader(input);
                using var rowGroupReader = fileReader.RowGroup(0);

                using var intReader    = rowGroupReader.Column(0).LogicalReader<int>();
                using var doubleReader = rowGroupReader.Column(1).LogicalReader<double>();
                using var stringReader = rowGroupReader.Column(2).LogicalReader<string>();
                using var boolReader   = rowGroupReader.Column(3).LogicalReader<bool>();

                for (var batch = 0; batch < numBatches; ++batch)
                {
                    var offset = batchSize * batch;
                    var range  = Enumerable.Range(offset, batchSize);

                    Assert.AreEqual(range.ToArray(), intReader.ReadAll(batchSize));
                    Assert.AreEqual(range.Select(i => (double) i).ToArray(), doubleReader.ReadAll(batchSize));
                    Assert.AreEqual(range.Select(i => i.ToString()).ToArray(), stringReader.ReadAll(batchSize));
                    Assert.AreEqual(range.Select(i => i % 2 == 0).ToArray(), boolReader.ReadAll(batchSize));
                }

                fileReader.Close();
            }
        }
        // NOTE(review): scraper artifact ("Beispiel #9" sample header and vote
        // count) — not valid C#; kept here as a comment so the file compiles.
        public static void TestProperties()
        {
            // Test the various properties exposed by ParquetFileWriter.

            // Non-default writer properties, so we can check below that they
            // surface through fileWriter.WriterProperties.
            using var writerPropertiesBuilder = new WriterPropertiesBuilder();
            using var writerProperties        = writerPropertiesBuilder
                                                .Compression(Compression.Zstd)
                                                .DisableDictionary()
                                                .CreatedBy("Some crazy unit test")
                                                .Build();

            var columns = new Column[]
            {
                new Column <int>("Index"),
                new Column <float>("Value")
            };

            // Key-value metadata, including non-ASCII keys and values.
            var kvm = (IReadOnlyDictionary <string, string>) new Dictionary <string, string>
            {
                { "some key", "some value" },
                { "α ∧ ¬β", "2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm" }
            };

            using var buffer     = new ResizableBuffer();
            using var outStream  = new BufferOutputStream(buffer);
            using var fileWriter = new ParquetFileWriter(outStream, columns, writerProperties, kvm);

            // Before anything is written: schema-derived properties are available,
            // but FileMetaData is still null.
            Assert.AreEqual(2, fileWriter.NumColumns);
            Assert.AreEqual(0, fileWriter.NumRows);
            Assert.AreEqual(0, fileWriter.NumRowGroups);
            Assert.IsNull(fileWriter.FileMetaData);
            Assert.AreEqual(Column.CreateSchemaNode(columns), fileWriter.Schema.GroupNode);
            Assert.AreEqual(Column.CreateSchemaNode(columns), fileWriter.Schema.SchemaRoot);
            Assert.AreEqual(columns[0].Name, fileWriter.ColumnDescriptor(0).Name);
            Assert.AreEqual(columns[1].Name, fileWriter.ColumnDescriptor(1).Name);
            Assert.AreEqual(kvm, fileWriter.KeyValueMetadata);

            // The writer properties set above are reflected by the writer.
            Assert.AreEqual(Compression.Zstd, fileWriter.WriterProperties.Compression(new ColumnPath("")));
            Assert.AreEqual(false, fileWriter.WriterProperties.DictionaryEnabled(new ColumnPath("")));
            Assert.AreEqual("Some crazy unit test", fileWriter.WriterProperties.CreatedBy);

            // While a row group is open and columns are written, NumRows stays 0
            // and FileMetaData stays null; only NumRowGroups reflects the append.
            using (var groupWriter = fileWriter.AppendRowGroup())
            {
                Assert.AreEqual(0, fileWriter.NumRows);
                Assert.AreEqual(1, fileWriter.NumRowGroups);
                Assert.IsNull(fileWriter.FileMetaData);

                using (var writer = groupWriter.NextColumn().LogicalWriter <int>())
                {
                    writer.WriteBatch(new[] { 1, 2, 3, 4, 5, 6 });
                }

                Assert.AreEqual(0, fileWriter.NumRows);
                Assert.AreEqual(1, fileWriter.NumRowGroups);
                Assert.IsNull(fileWriter.FileMetaData);

                using (var writer = groupWriter.NextColumn().LogicalWriter <float>())
                {
                    writer.WriteBatch(new[] { 1f, 2f, 3f, 4f, 5f, 6f });
                }

                Assert.AreEqual(0, fileWriter.NumRows);
                Assert.AreEqual(1, fileWriter.NumRowGroups);
                Assert.IsNull(fileWriter.FileMetaData);
            }

            // Still unchanged after the row group is disposed.
            Assert.AreEqual(0, fileWriter.NumRows);
            Assert.AreEqual(1, fileWriter.NumRowGroups);
            Assert.IsNull(fileWriter.FileMetaData);

            // Closing the writer finalises the file and populates FileMetaData.
            fileWriter.Close();

            //Assert.AreEqual(0, fileWriter.NumRows); // 2021-04-08: calling this results in a segfault when the writer has been closed
            //Assert.AreEqual(1, fileWriter.NumRowGroups); // 2021-04-08: calling this results in a segfault when the writer has been closed
            Assert.IsNotNull(fileWriter.FileMetaData);
            Assert.AreEqual(2, fileWriter.FileMetaData?.NumColumns);
            Assert.AreEqual(6, fileWriter.FileMetaData?.NumRows);
            Assert.AreEqual(1, fileWriter.FileMetaData?.NumRowGroups);
            Assert.AreEqual(kvm, fileWriter.FileMetaData?.KeyValueMetadata);
        }