// Example #1
        public static void TestRoundTripBuffered(
            // 2^i, 7^j, 11^k are mutually co-prime for i,j,k>0
            [Values(2, 8, 32, 128)] int rowsPerBatch,
            [Values(7, 49, 343, 2401)] int writeBufferLength,
            [Values(11, 121, 1331)] int readBufferLength,
            [Values(true, false)] bool useDictionaryEncoding
            )
        {
            // Round-trip the expected columns through an in-memory parquet file using a
            // buffered row group, writing in short interleaved ranges across all columns.
            var expectedColumns = CreateExpectedColumns();
            var schemaColumns = expectedColumns.Select(
                c => new Column(
                    c.Values.GetType().GetElementType() ?? throw new InvalidOperationException(),
                    c.Name,
                    c.LogicalTypeOverride)).ToArray();

            using var buffer = new ResizableBuffer();

            // Write the expected columns to an in-memory parquet file.
            using (var outStream = new BufferOutputStream(buffer))
            {
                using var writerProperties = CreateWriterProperties(expectedColumns, useDictionaryEncoding);
                using var fileWriter = new ParquetFileWriter(outStream, schemaColumns, writerProperties);
                using var rowGroupWriter = fileWriter.AppendBufferedRowGroup();

                const int rangeLength = 9;

                // Alternate between columns one short range at a time to exercise
                // the buffered row group's column switching.
                for (var rangeStart = 0; rangeStart < NumRows; rangeStart += rangeLength)
                {
                    var rangeEnd = Math.Min(rangeStart + rangeLength, NumRows);

                    for (var columnIndex = 0; columnIndex < expectedColumns.Length; ++columnIndex)
                    {
                        var column = expectedColumns[columnIndex];
                        var range = (rangeStart, rangeEnd);

                        Console.WriteLine("Writing '{0}' (element type: {1}) (range: {2})", column.Name, column.Values.GetType().GetElementType(), range);

                        using var columnWriter = rowGroupWriter.Column(columnIndex).LogicalWriter(writeBufferLength);
                        columnWriter.Apply(new LogicalValueSetter(column.Values, rowsPerBatch, range));
                    }
                }

                fileWriter.Close();
            }

            Console.WriteLine();

            // Read back the columns and check they match what was written.
            AssertReadRoundtrip(rowsPerBatch, readBufferLength, buffer, expectedColumns);
        }
        private static void TestRoundTripBuffered(ExpectedColumn[] expectedColumns, bool useDictionaryEncoding)
        {
            // Same as the default round-trip test, but use buffered row groups.

            var schema = CreateSchema(expectedColumns);
            // FIX: dispose the writer properties — the sibling overload uses
            // 'using var' for the same call; previously this instance leaked.
            using var writerProperties = CreateWriterProperties(expectedColumns, useDictionaryEncoding);
            var keyValueMetadata = new Dictionary <string, string> {
                { "case", "Test" }, { "Awesome", "true" }
            };

            using var buffer = new ResizableBuffer();

            // Write our expected columns to the parquet in-memory file.
            using (var outStream = new BufferOutputStream(buffer))
            {
                using var fileWriter     = new ParquetFileWriter(outStream, schema, writerProperties, keyValueMetadata);
                using var rowGroupWriter = fileWriter.AppendBufferedRowGroup();

                const int rangeLength = 9;
                var       numRows     = expectedColumns.First().Values.Length;

                // Interleave short ranges across columns to exercise the buffered writer.
                for (int r = 0; r < numRows; r += rangeLength)
                {
                    for (var i = 0; i < expectedColumns.Length; i++)
                    {
                        var column = expectedColumns[i];
                        var range  = (r, Math.Min(r + rangeLength, numRows));

                        // Only log the first and last range per column to keep output manageable.
                        if (range.Item1 == 0 || range.Item2 == numRows)
                        {
                            Console.WriteLine("Writing '{0}' (range: {1})", column.Name, range);
                        }

                        using var columnWriter = rowGroupWriter.Column(i);
                        columnWriter.Apply(new ValueSetter(column.Values, range));
                    }
                }

                fileWriter.Close();
            }

            // Read back the columns and make sure they match.
            AssertReadRoundtrip(buffer, expectedColumns, useDictionaryEncoding);
        }
        public static void TestBigFileBufferedRowGroup()
        {
            // Push a large number of rows through a buffered row group and read them
            // back, to flush out issues that only show up at volume.
            const int numBatches = 64;
            const int batchSize  = 8192;

            using var buffer = new ResizableBuffer();

            // Write numBatches * batchSize rows across four columns.
            using (var output = new BufferOutputStream(buffer))
            {
                var columns = new Column[]
                {
                    new Column <int>("int"),
                    new Column <double>("double"),
                    new Column <string>("string"),
                    new Column <bool>("bool")
                };

                using var builder          = new WriterPropertiesBuilder();
                using var writerProperties = builder.Compression(Compression.Snappy).DisableDictionary("double").Build();
                using var fileWriter       = new ParquetFileWriter(output, columns, writerProperties);
                using var rowGroupWriter   = fileWriter.AppendBufferedRowGroup();

                using var intWriter    = rowGroupWriter.Column(0).LogicalWriter <int>();
                using var doubleWriter = rowGroupWriter.Column(1).LogicalWriter <double>();
                using var stringWriter = rowGroupWriter.Column(2).LogicalWriter <string>();
                using var boolWriter   = rowGroupWriter.Column(3).LogicalWriter <bool>();

                for (var batch = 0; batch != numBatches; ++batch)
                {
                    // Each batch derives its values from a contiguous run of indices.
                    var indices = Enumerable.Range(batchSize * batch, batchSize).ToArray();

                    intWriter.WriteBatch(indices);
                    doubleWriter.WriteBatch(indices.Select(i => (double)i).ToArray());
                    stringWriter.WriteBatch(indices.Select(i => i.ToString()).ToArray());
                    boolWriter.WriteBatch(indices.Select(i => i % 2 == 0).ToArray());
                }

                fileWriter.Close();
            }

            // Read everything back batch by batch and verify each column.
            using (var input = new BufferReader(buffer))
            {
                using var fileReader     = new ParquetFileReader(input);
                using var rowGroupReader = fileReader.RowGroup(0);

                using var intReader    = rowGroupReader.Column(0).LogicalReader <int>();
                using var doubleReader = rowGroupReader.Column(1).LogicalReader <double>();
                using var stringReader = rowGroupReader.Column(2).LogicalReader <string>();
                using var boolReader   = rowGroupReader.Column(3).LogicalReader <bool>();

                for (var batch = 0; batch != numBatches; ++batch)
                {
                    var indices = Enumerable.Range(batchSize * batch, batchSize).ToArray();

                    Assert.AreEqual(indices, intReader.ReadAll(batchSize));
                    Assert.AreEqual(indices.Select(i => (double)i).ToArray(), doubleReader.ReadAll(batchSize));
                    Assert.AreEqual(indices.Select(i => i.ToString()).ToArray(), stringReader.ReadAll(batchSize));
                    Assert.AreEqual(indices.Select(i => i % 2 == 0).ToArray(), boolReader.ReadAll(batchSize));
                }

                fileReader.Close();
            }
        }