public static void TestRoundTripBuffered(
    // 2^i, 7^j, 11^k are mutually co-prime for i,j,k>0
    [Values(2, 8, 32, 128)] int rowsPerBatch,
    [Values(7, 49, 343, 2401)] int writeBufferLength,
    [Values(11, 121, 1331)] int readBufferLength,
    [Values(true, false)] bool useDictionaryEncoding)
{
    // Round-trip test using a buffered row group: write the expected columns in
    // interleaved row ranges, then read them back and compare against the originals.
    var expectedColumns = CreateExpectedColumns();

    // Build the schema columns from the expected data, taking the element type of
    // each values array (which must be an array type).
    var schemaColumns = new Column[expectedColumns.Length];
    for (var c = 0; c < expectedColumns.Length; c++)
    {
        var expected = expectedColumns[c];
        var elementType = expected.Values.GetType().GetElementType() ?? throw new InvalidOperationException();
        schemaColumns[c] = new Column(elementType, expected.Name, expected.LogicalTypeOverride);
    }

    using var buffer = new ResizableBuffer();

    // Write our expected columns to the parquet in-memory file.
    using (var outStream = new BufferOutputStream(buffer))
    {
        using var writerProperties = CreateWriterProperties(expectedColumns, useDictionaryEncoding);
        using var fileWriter = new ParquetFileWriter(outStream, schemaColumns, writerProperties);
        using var rowGroupWriter = fileWriter.AppendBufferedRowGroup();

        // Write in small interleaved ranges so that every column gets revisited
        // multiple times within the single buffered row group.
        const int rangeLength = 9;

        for (var rowStart = 0; rowStart < NumRows; rowStart += rangeLength)
        {
            for (var columnIndex = 0; columnIndex < expectedColumns.Length; columnIndex++)
            {
                var column = expectedColumns[columnIndex];
                var rowRange = (rowStart, Math.Min(rowStart + rangeLength, NumRows));

                Console.WriteLine("Writing '{0}' (element type: {1}) (range: {2})", column.Name, column.Values.GetType().GetElementType(), rowRange);

                using var columnWriter = rowGroupWriter.Column(columnIndex).LogicalWriter(writeBufferLength);
                columnWriter.Apply(new LogicalValueSetter(column.Values, rowsPerBatch, rowRange));
            }
        }

        fileWriter.Close();
    }

    Console.WriteLine();

    // Read back the columns and make sure they match.
    AssertReadRoundtrip(rowsPerBatch, readBufferLength, buffer, expectedColumns);
}
private static void TestRoundTripBuffered(ExpectedColumn[] expectedColumns, bool useDictionaryEncoding)
{
    // Same as the default round-trip test, but use buffered row groups.
    var schema = CreateSchema(expectedColumns);
    // Fix: WriterProperties wraps a native handle and was previously never disposed;
    // the sibling buffered round-trip test and TestBigFileBufferedRowGroup both use
    // a using declaration for it, so do the same here.
    using var writerProperties = CreateWriterProperties(expectedColumns, useDictionaryEncoding);
    var keyValueMetadata = new Dictionary<string, string> { { "case", "Test" }, { "Awesome", "true" } };

    using var buffer = new ResizableBuffer();

    // Write our expected columns to the parquet in-memory file.
    using (var outStream = new BufferOutputStream(buffer))
    {
        using var fileWriter = new ParquetFileWriter(outStream, schema, writerProperties, keyValueMetadata);
        using var rowGroupWriter = fileWriter.AppendBufferedRowGroup();

        // Write in small interleaved ranges so each column is revisited several
        // times within the single buffered row group.
        const int rangeLength = 9;
        var numRows = expectedColumns.First().Values.Length;

        for (int r = 0; r < numRows; r += rangeLength)
        {
            for (var i = 0; i < expectedColumns.Length; i++)
            {
                var column = expectedColumns[i];
                var range = (r, Math.Min(r + rangeLength, numRows));

                // Only log the first and last range per column to keep output short.
                if (range.Item1 == 0 || range.Item2 == numRows)
                {
                    Console.WriteLine("Writing '{0}' (range: {1})", column.Name, range);
                }

                using var columnWriter = rowGroupWriter.Column(i);
                columnWriter.Apply(new ValueSetter(column.Values, range));
            }
        }

        fileWriter.Close();
    }

    // Read back the columns and make sure they match.
    AssertReadRoundtrip(buffer, expectedColumns, useDictionaryEncoding);
}
public static void TestBigFileBufferedRowGroup()
{
    // Test a large amount of rows with a buffered row group to uncover any particular issue.
    const int numBatches = 64;
    const int batchSize = 8192;

    using var buffer = new ResizableBuffer();

    using (var output = new BufferOutputStream(buffer))
    {
        var columns = new Column[]
        {
            new Column<int>("int"),
            new Column<double>("double"),
            new Column<string>("string"),
            new Column<bool>("bool")
        };

        using var builder = new WriterPropertiesBuilder();
        using var writerProperties = builder.Compression(Compression.Snappy).DisableDictionary("double").Build();
        using var fileWriter = new ParquetFileWriter(output, columns, writerProperties);
        using var rowGroupWriter = fileWriter.AppendBufferedRowGroup();

        using var intWriter = rowGroupWriter.Column(0).LogicalWriter<int>();
        using var doubleWriter = rowGroupWriter.Column(1).LogicalWriter<double>();
        using var stringWriter = rowGroupWriter.Column(2).LogicalWriter<string>();
        using var boolWriter = rowGroupWriter.Column(3).LogicalWriter<bool>();

        for (var batch = 0; batch < numBatches; ++batch)
        {
            // All four columns derive their values from the same integer sequence.
            var baseValues = Enumerable.Range(batchSize * batch, batchSize).ToArray();

            intWriter.WriteBatch(baseValues);
            doubleWriter.WriteBatch(baseValues.Select(i => (double) i).ToArray());
            stringWriter.WriteBatch(baseValues.Select(i => i.ToString()).ToArray());
            boolWriter.WriteBatch(baseValues.Select(i => i % 2 == 0).ToArray());
        }

        fileWriter.Close();
    }

    using (var input = new BufferReader(buffer))
    {
        using var fileReader = new ParquetFileReader(input);
        using var rowGroupReader = fileReader.RowGroup(0);

        using var intReader = rowGroupReader.Column(0).LogicalReader<int>();
        using var doubleReader = rowGroupReader.Column(1).LogicalReader<double>();
        using var stringReader = rowGroupReader.Column(2).LogicalReader<string>();
        using var boolReader = rowGroupReader.Column(3).LogicalReader<bool>();

        for (var batch = 0; batch < numBatches; ++batch)
        {
            // Read back one batch per column and compare to the generating sequence.
            var baseValues = Enumerable.Range(batchSize * batch, batchSize).ToArray();

            Assert.AreEqual(baseValues, intReader.ReadAll(batchSize));
            Assert.AreEqual(baseValues.Select(i => (double) i).ToArray(), doubleReader.ReadAll(batchSize));
            Assert.AreEqual(baseValues.Select(i => i.ToString()).ToArray(), stringReader.ReadAll(batchSize));
            Assert.AreEqual(baseValues.Select(i => i % 2 == 0).ToArray(), boolReader.ReadAll(batchSize));
        }

        fileReader.Close();
    }
}