public static void TestArrayEdgeCasesRoundtrip()
{
    // Rows under test (Python-style notation):
    //   [None, [], [1.0, None, 2.0]]
    //   []
    //   None
    //   [[]]
    var rows = new double?[][][]
    {
        new double?[][] {null, new double?[] { }, new double?[] {1.0, null, 2.0}},
        new double?[][] { },
        null,
        new double?[][] {new double?[] { }}
    };

    using var buffer = new ResizableBuffer();

    // Write the nested rows as a single logical column.
    using (var outStream = new BufferOutputStream(buffer))
    {
        using var writer = new ParquetFileWriter(outStream, new Column[] {new Column<double?[][]>("a")});
        using var rowGroupWriter = writer.AppendRowGroup();
        using var columnWriter = rowGroupWriter.NextColumn().LogicalWriter<double?[][]>();

        columnWriter.WriteBatch(rows);
        writer.Close();
    }

    // Read everything back and check the nested structure survived intact.
    using var inStream = new BufferReader(buffer);
    using var reader = new ParquetFileReader(inStream);
    using var rowGroupReader = reader.RowGroup(0);
    using var columnReader = rowGroupReader.Column(0).LogicalReader<double?[][]>();

    Assert.AreEqual(4, rowGroupReader.MetaData.NumRows);

    var roundTripped = columnReader.ReadAll(4);
    Assert.AreEqual(rows, roundTripped);
}
public static void TestDisposeExceptionSafety_ColumnWriter()
{
    // An exception thrown while a logical column writer is in scope must
    // propagate out through the disposal of all enclosing writers.
    var thrown = Assert.Throws<Exception>(() =>
    {
        using var buffer = new ResizableBuffer();
        using var outStream = new BufferOutputStream(buffer);
        using var fileWriter = new ParquetFileWriter(outStream, new Column[] {new Column<int>("Index"), new Column<float>("Value")});
        using var groupWriter = fileWriter.AppendRowGroup();

        using (var indexWriter = groupWriter.NextColumn().LogicalWriter<int>())
        {
            indexWriter.WriteBatch(new[] {1, 2, 3, 4, 5, 6});
        }

        using (var valueWriter = groupWriter.NextColumn().LogicalWriter<float>())
        {
            // Fail mid-write: disposal of the writers above must not mask this.
            throw new Exception("this is the expected message");
        }
    });

    Assert.That(thrown?.Message, Contains.Substring("this is the expected message"));
}
public static unsafe void TestParquetReadFromBuffer()
{
    // Round-trips a single int column, reading it back through an IO.Buffer
    // that wraps a pinned managed byte array directly (no copy on read).
    var expected = Enumerable.Range(0, 100).ToArray();

    // Write out a single column
    byte[] parquetFileBytes;

    using (var outBuffer = new ResizableBuffer())
    {
        using (var outStream = new BufferOutputStream(outBuffer))
        {
            using var fileWriter = new ParquetFileWriter(outStream, new Column[] {new Column<int>("int_field")});

            using (var rowGroupWriter = fileWriter.AppendRowGroup())
            {
                using var colWriter = rowGroupWriter.NextColumn().LogicalWriter<int>();
                colWriter.WriteBatch(expected);
            }

            // Flush the file footer before snapshotting the buffer contents.
            fileWriter.Close();
        }

        parquetFileBytes = outBuffer.ToArray();
    }

    // Read it back
    // The managed array must stay pinned for as long as the IO.Buffer (and
    // every reader built on top of it) is alive, hence the enclosing fixed
    // block around all of the reading code.
    fixed (byte* fixedBytes = parquetFileBytes)
    {
        using var buffer = new IO.Buffer(new IntPtr(fixedBytes), parquetFileBytes.Length);
        using var inStream = new BufferReader(buffer);
        using var fileReader = new ParquetFileReader(inStream);
        using var rowGroup = fileReader.RowGroup(0);
        using var columnReader = rowGroup.Column(0).LogicalReader<int>();

        var allData = columnReader.ReadAll((int) rowGroup.MetaData.NumRows);
        Assert.AreEqual(expected, allData);
    }
}
public static void TestRoundTrip(
    // 2^i, 7^j, 11^k are mutually co-prime for i,j,k>0
    [Values(2, 8, 32, 128)] int rowsPerBatch,
    [Values(7, 49, 343, 2401)] int writeBufferLength,
    [Values(11, 121, 1331)] int readBufferLength,
    [Values(true, false)] bool useDictionaryEncoding
)
{
    // Round-trips every expected column through an in-memory parquet file,
    // exercising mismatched write-buffer, read-buffer and batch sizes (hence
    // the mutually co-prime parameter values) with and without dictionary
    // encoding.
    var expectedColumns = CreateExpectedColumns();
    // GetElementType() can return null in general; the guard turns that into
    // an explicit failure rather than passing null to the Column constructor.
    var schemaColumns = expectedColumns
        .Select(c => new Column(c.Values.GetType().GetElementType() ?? throw new InvalidOperationException(), c.Name, c.LogicalTypeOverride))
        .ToArray();

    using var buffer = new ResizableBuffer();

    // Write our expected columns to the parquet in-memory file.
    using (var outStream = new BufferOutputStream(buffer))
    {
        using var writerProperties = CreateWriterProperties(expectedColumns, useDictionaryEncoding);
        using var fileWriter = new ParquetFileWriter(outStream, schemaColumns, writerProperties);
        using var rowGroupWriter = fileWriter.AppendRowGroup();

        foreach (var column in expectedColumns)
        {
            Console.WriteLine("Writing '{0}' ({1})", column.Name, column.Values.GetType().GetElementType());

            using var columnWriter = rowGroupWriter.NextColumn().LogicalWriter(writeBufferLength);
            columnWriter.Apply(new LogicalValueSetter(column.Values, rowsPerBatch));
        }

        // Close before outStream is disposed so the footer is flushed.
        fileWriter.Close();
    }

    Console.WriteLine();

    // Read back the columns and make sure they match.
    AssertReadRoundtrip(rowsPerBatch, readBufferLength, buffer, expectedColumns);
}
private static ResizableBuffer WriteTestValues<TValue>(TValue[] written)
{
    // Writes the given values as a single "values" column and hands the
    // resulting buffer to the caller, who becomes responsible for disposing
    // it. The buffer is only disposed here if writing fails part-way through.
    var buffer = new ResizableBuffer();

    try
    {
        using var output = new BufferOutputStream(buffer);
        using var writer = new ParquetFileWriter(output, new Column[] {new Column<TValue>("values")});
        using var group = writer.AppendRowGroup();
        using var column = group.NextColumn().LogicalWriter<TValue>();

        column.WriteBatch(written);
        writer.Close();

        return buffer;
    }
    catch
    {
        buffer.Dispose();
        throw;
    }
}
private static void TestWriteNoWriterOverride<TValue, TCustom>(TValue[] expected, TCustom[] written)
{
    using var buffer = new ResizableBuffer();

    // Write values using a custom user-type:
    // - a type factory lets Column<TCustom> be converted to the right schema node,
    // - a converter factory lets TCustom values be written as the physical type,
    // - the element type is NOT explicitly overridden on the LogicalColumnWriter.
    using (var sink = new BufferOutputStream(buffer))
    {
        using var writer = new ParquetFileWriter(sink, new Column[] {new Column<TCustom>("values")}, new WriteTypeFactory())
        {
            LogicalWriteConverterFactory = new WriteConverterFactory()
        };
        using var group = writer.AppendRowGroup();
        using var column = group.NextColumn().LogicalWriter<TCustom>();

        column.WriteBatch(written);
        writer.Close();
    }

    // The file should contain the plain physical values.
    CheckWrittenValues(buffer, expected);
}
public static void TestReaderWriteTypes(
    // 2^i, 7^j, 11^k are mutually co-prime for i,j,k>0
    [Values(2, 8, 32, 128)] int rowsPerBatch,
    [Values(7, 49, 343, 2401)] int writeBufferLength,
    [Values(11, 121, 1331)] int readBufferLength
)
{
    // Round-trips the expected columns and verifies column descriptors,
    // chunk statistics and both the Apply- and IEnumerable-based readers.
    var expectedColumns = CreateExpectedColumns();
    // Guard against GetElementType() returning null so we fail with a clear
    // exception instead of passing null to the Column constructor. This
    // matches the equivalent schema construction in TestRoundTrip.
    var schemaColumns = expectedColumns
        .Select(c => new Column(c.Values.GetType().GetElementType() ?? throw new InvalidOperationException(), c.Name, c.LogicalTypeOverride))
        .ToArray();

    using var buffer = new ResizableBuffer();

    // Write our expected columns to the parquet in-memory file.
    using (var outStream = new BufferOutputStream(buffer))
    {
        using var fileWriter = new ParquetFileWriter(outStream, schemaColumns);

        using (var rowGroupWriter = fileWriter.AppendRowGroup())
        {
            foreach (var column in expectedColumns)
            {
                Console.WriteLine("Writing '{0}' ({1})", column.Name, column.Values.GetType().GetElementType());

                using var columnWriter = rowGroupWriter.NextColumn().LogicalWriter(writeBufferLength);
                columnWriter.Apply(new LogicalValueSetter(column.Values, rowsPerBatch));
            }
        }

        // Flush the footer before the output stream is disposed.
        fileWriter.Close();
    }

    Console.WriteLine();

    // Read back the columns and make sure they match.
    using var inStream = new BufferReader(buffer);
    using var fileReader = new ParquetFileReader(inStream);
    using var fileMetaData = fileReader.FileMetaData;
    using var rowGroupReader = fileReader.RowGroup(0);

    var rowGroupMetaData = rowGroupReader.MetaData;
    var numRows = rowGroupMetaData.NumRows;

    for (int c = 0; c != fileMetaData.NumColumns; ++c)
    {
        var expected = expectedColumns[c];

        // Test properties, and read methods.
        using (var columnReader = rowGroupReader.Column(c).LogicalReader(readBufferLength))
        {
            var descr = columnReader.ColumnDescriptor;
            var chunkMetaData = rowGroupMetaData.GetColumnChunkMetaData(c);
            var statistics = chunkMetaData.Statistics;

            Console.WriteLine("Reading '{0}'", expected.Name);

            Assert.AreEqual(expected.PhysicalType, descr.PhysicalType);
            Assert.AreEqual(expected.LogicalType, descr.LogicalType);
            Assert.AreEqual(expected.Values, columnReader.Apply(new LogicalValueGetter(checked((int) numRows), rowsPerBatch)));
            Assert.AreEqual(expected.Length, descr.TypeLength);
            Assert.AreEqual((expected.LogicalType as DecimalLogicalType)?.Precision ?? -1, descr.TypePrecision);
            Assert.AreEqual((expected.LogicalType as DecimalLogicalType)?.Scale ?? -1, descr.TypeScale);
            Assert.AreEqual(expected.HasStatistics, chunkMetaData.IsStatsSet);

            if (expected.HasStatistics)
            {
                Assert.AreEqual(expected.HasMinMax, statistics.HasMinMax);
                //Assert.AreEqual(expected.NullCount, statistics.NullCount);
                //Assert.AreEqual(expected.NumValues, statistics.NumValues);
                Assert.AreEqual(expected.PhysicalType, statistics.PhysicalType);

                // BUG Don't check for decimal until https://issues.apache.org/jira/browse/ARROW-6149 is fixed.
                var buggy = expected.LogicalType is DecimalLogicalType;

                if (expected.HasMinMax && !buggy)
                {
                    Assert.AreEqual(expected.Min, expected.Converter(statistics.MinUntyped));
                    Assert.AreEqual(expected.Max, expected.Converter(statistics.MaxUntyped));
                }
            }
            else
            {
                Assert.IsNull(statistics);
            }
        }

        // Test IEnumerable interface
        using (var columnReader = rowGroupReader.Column(c).LogicalReader(readBufferLength))
        {
            Assert.AreEqual(expected.Values, columnReader.Apply(new LogicalColumnReaderToArray()));
        }
    }
}
public static void TestReaderWriteTypes()
{
    // Writes columns via the physical-value API and verifies file-level,
    // row-group-level and per-column metadata as well as the values.
    var schema = CreateSchema();
    var writerProperties = CreateWriterProperties();
    var keyValueMetadata = new Dictionary<string, string> {{"case", "Test"}, {"Awesome", "true"}};
    var expectedColumns = CreateExpectedColumns();

    using (var buffer = new ResizableBuffer())
    {
        // Write our expected columns to the parquet in-memory file.
        using (var outStream = new BufferOutputStream(buffer))
        using (var fileWriter = new ParquetFileWriter(outStream, schema, writerProperties, keyValueMetadata))
        using (var rowGroupWriter = fileWriter.AppendRowGroup())
        {
            foreach (var column in expectedColumns)
            {
                Console.WriteLine("Writing '{0}'", column.Name);

                using (var columnWriter = rowGroupWriter.NextColumn())
                {
                    columnWriter.Apply(new ValueSetter(column.Values));
                }
            }
        }

        // Read back the columns and make sure they match.
        using (var inStream = new BufferReader(buffer))
        using (var fileReader = new ParquetFileReader(inStream))
        {
            // FileMetaData is disposable; dispose it like the other readers.
            using var fileMetaData = fileReader.FileMetaData;

            Assert.AreEqual("parquet-cpp version 1.5.1-SNAPSHOT", fileMetaData.CreatedBy);
            Assert.AreEqual(new Dictionary<string, string> {{"case", "Test"}, {"Awesome", "true"}}, fileMetaData.KeyValueMetadata);
            Assert.AreEqual(expectedColumns.Length, fileMetaData.NumColumns);
            Assert.AreEqual(NumRows, fileMetaData.NumRows);
            Assert.AreEqual(1, fileMetaData.NumRowGroups);
            Assert.AreEqual(1 + expectedColumns.Length, fileMetaData.NumSchemaElements);
            Assert.AreEqual(ParquetVersion.PARQUET_1_0, fileMetaData.Version);
            Assert.AreEqual("parquet-cpp version 1.5.1", fileMetaData.WriterVersion.ToString());

            using (var rowGroupReader = fileReader.RowGroup(0))
            {
                var rowGroupMetaData = rowGroupReader.MetaData;

                for (int c = 0; c != fileMetaData.NumColumns; ++c)
                {
                    using (var columnReader = rowGroupReader.Column(c))
                    {
                        var expected = expectedColumns[c];

                        Console.WriteLine("Reading '{0}'", expected.Name);

                        var descr = columnReader.ColumnDescriptor;
                        // BUG FIX: previously this fetched GetColumnChunkMetaData(0) on
                        // every iteration, so the chunk metadata (encodings, compression,
                        // values) of columns other than the first was never checked.
                        var chunkMetaData = rowGroupMetaData.GetColumnChunkMetaData(c);

                        Assert.AreEqual(expected.MaxDefinitionlevel, descr.MaxDefinitionLevel);
                        Assert.AreEqual(expected.MaxRepetitionLevel, descr.MaxRepetitionLevel);
                        Assert.AreEqual(expected.PhysicalType, descr.PhysicalType);
                        Assert.AreEqual(expected.LogicalType, descr.LogicalType);
                        Assert.AreEqual(expected.ColumnOrder, descr.ColumnOrder);
                        Assert.AreEqual(expected.SortOrder, descr.SortOrder);
                        Assert.AreEqual(expected.Name, descr.Name);
                        Assert.AreEqual(expected.TypeLength, descr.TypeLength);
                        Assert.AreEqual(expected.TypePrecision, descr.TypePrecision);
                        Assert.AreEqual(expected.TypeScale, descr.TypeScale);
                        Assert.AreEqual(expected.Encodings, chunkMetaData.Encodings);
                        Assert.AreEqual(expected.Compression, chunkMetaData.Compression);
                        Assert.AreEqual(expected.Values, columnReader.Apply(new PhysicalValueGetter(chunkMetaData.NumValues)).values);
                    }
                }
            }
        }
    }
}
public static void TestBigFileBufferedRowGroup()
{
    // Test a large amount of rows with a buffered row group to uncover any particular issue.
    const int numBatches = 64;
    const int batchSize = 8192;

    using var buffer = new ResizableBuffer();

    using (var output = new BufferOutputStream(buffer))
    {
        var columns = new Column[]
        {
            new Column<int>("int"),
            new Column<double>("double"),
            new Column<string>("string"),
            new Column<bool>("bool")
        };

        using var builder = new WriterPropertiesBuilder();
        // Dictionary encoding is disabled for the "double" column only, so
        // both the dictionary and non-dictionary paths are exercised.
        using var writerProperties = builder.Compression(Compression.Snappy).DisableDictionary("double").Build();
        using var fileWriter = new ParquetFileWriter(output, columns, writerProperties);
        // A buffered row group allows interleaved writes across all columns.
        using var rowGroupWriter = fileWriter.AppendBufferedRowGroup();

        using var col0 = rowGroupWriter.Column(0).LogicalWriter<int>();
        using var col1 = rowGroupWriter.Column(1).LogicalWriter<double>();
        using var col2 = rowGroupWriter.Column(2).LogicalWriter<string>();
        using var col3 = rowGroupWriter.Column(3).LogicalWriter<bool>();

        // Write numBatches * batchSize rows, alternating between columns on
        // each batch (only valid with a buffered row group).
        for (var batchIndex = 0; batchIndex < numBatches; ++batchIndex)
        {
            var startIndex = batchSize * batchIndex;

            col0.WriteBatch(Enumerable.Range(startIndex, batchSize).ToArray());
            col1.WriteBatch(Enumerable.Range(startIndex, batchSize).Select(i => (double) i).ToArray());
            col2.WriteBatch(Enumerable.Range(startIndex, batchSize).Select(i => i.ToString()).ToArray());
            col3.WriteBatch(Enumerable.Range(startIndex, batchSize).Select(i => i % 2 == 0).ToArray());
        }

        fileWriter.Close();
    }

    using (var input = new BufferReader(buffer))
    {
        using var fileReader = new ParquetFileReader(input);
        using var rowGroupReader = fileReader.RowGroup(0);

        using var col0 = rowGroupReader.Column(0).LogicalReader<int>();
        using var col1 = rowGroupReader.Column(1).LogicalReader<double>();
        using var col2 = rowGroupReader.Column(2).LogicalReader<string>();
        using var col3 = rowGroupReader.Column(3).LogicalReader<bool>();

        // Read the data back batch by batch and compare against regenerated values.
        for (var batchIndex = 0; batchIndex < numBatches; ++batchIndex)
        {
            var startIndex = batchSize * batchIndex;

            Assert.AreEqual(Enumerable.Range(startIndex, batchSize).ToArray(), col0.ReadAll(batchSize));
            Assert.AreEqual(Enumerable.Range(startIndex, batchSize).Select(i => (double) i).ToArray(), col1.ReadAll(batchSize));
            Assert.AreEqual(Enumerable.Range(startIndex, batchSize).Select(i => i.ToString()).ToArray(), col2.ReadAll(batchSize));
            Assert.AreEqual(Enumerable.Range(startIndex, batchSize).Select(i => i % 2 == 0).ToArray(), col3.ReadAll(batchSize));
        }

        fileReader.Close();
    }
}
public static void TestProperties()
{
    // Test the various properties exposed by ParquetFileWriter.
    using var writerPropertiesBuilder = new WriterPropertiesBuilder();
    using var writerProperties = writerPropertiesBuilder
        .Compression(Compression.Zstd)
        .DisableDictionary()
        .CreatedBy("Some crazy unit test")
        .Build();

    var columns = new Column[] {new Column<int>("Index"), new Column<float>("Value")};
    // Non-ASCII key/value pairs exercise UTF-8 handling of the metadata.
    var kvm = (IReadOnlyDictionary<string, string>) new Dictionary<string, string>
    {
        {"some key", "some value"},
        {"α ∧ ¬β", "2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm"}
    };

    using var buffer = new ResizableBuffer();
    using var outStream = new BufferOutputStream(buffer);
    using var fileWriter = new ParquetFileWriter(outStream, columns, writerProperties, kvm);

    // Before any row group is appended: no rows, no groups, no file metadata.
    Assert.AreEqual(2, fileWriter.NumColumns);
    Assert.AreEqual(0, fileWriter.NumRows);
    Assert.AreEqual(0, fileWriter.NumRowGroups);
    Assert.IsNull(fileWriter.FileMetaData);
    Assert.AreEqual(Column.CreateSchemaNode(columns), fileWriter.Schema.GroupNode);
    Assert.AreEqual(Column.CreateSchemaNode(columns), fileWriter.Schema.SchemaRoot);
    Assert.AreEqual(columns[0].Name, fileWriter.ColumnDescriptor(0).Name);
    Assert.AreEqual(columns[1].Name, fileWriter.ColumnDescriptor(1).Name);
    Assert.AreEqual(kvm, fileWriter.KeyValueMetadata);
    Assert.AreEqual(Compression.Zstd, fileWriter.WriterProperties.Compression(new ColumnPath("")));
    Assert.AreEqual(false, fileWriter.WriterProperties.DictionaryEnabled(new ColumnPath("")));
    Assert.AreEqual("Some crazy unit test", fileWriter.WriterProperties.CreatedBy);

    using (var groupWriter = fileWriter.AppendRowGroup())
    {
        // NumRows stays 0 until the file is closed; the group is counted immediately.
        Assert.AreEqual(0, fileWriter.NumRows);
        Assert.AreEqual(1, fileWriter.NumRowGroups);
        Assert.IsNull(fileWriter.FileMetaData);

        using (var writer = groupWriter.NextColumn().LogicalWriter<int>())
        {
            writer.WriteBatch(new[] {1, 2, 3, 4, 5, 6});
        }

        Assert.AreEqual(0, fileWriter.NumRows);
        Assert.AreEqual(1, fileWriter.NumRowGroups);
        Assert.IsNull(fileWriter.FileMetaData);

        using (var writer = groupWriter.NextColumn().LogicalWriter<float>())
        {
            writer.WriteBatch(new[] {1f, 2f, 3f, 4f, 5f, 6f});
        }

        Assert.AreEqual(0, fileWriter.NumRows);
        Assert.AreEqual(1, fileWriter.NumRowGroups);
        Assert.IsNull(fileWriter.FileMetaData);
    }

    Assert.AreEqual(0, fileWriter.NumRows);
    Assert.AreEqual(1, fileWriter.NumRowGroups);
    Assert.IsNull(fileWriter.FileMetaData);

    // After Close(), FileMetaData becomes available; NumRows/NumRowGroups
    // must not be touched (see the notes below).
    fileWriter.Close();

    //Assert.AreEqual(0, fileWriter.NumRows); // 2021-04-08: calling this results in a segfault when the writer has been closed
    //Assert.AreEqual(1, fileWriter.NumRowGroups); // 2021-04-08: calling this results in a segfault when the writer has been closed
    Assert.IsNotNull(fileWriter.FileMetaData);
    Assert.AreEqual(2, fileWriter.FileMetaData?.NumColumns);
    Assert.AreEqual(6, fileWriter.FileMetaData?.NumRows);
    Assert.AreEqual(1, fileWriter.FileMetaData?.NumRowGroups);
    Assert.AreEqual(kvm, fileWriter.FileMetaData?.KeyValueMetadata);
}
public UtilStreamBuffer(int initialsize)
{
    // Allocate the backing byte array and construct the input/output stream
    // objects with their default constructors.
    // NOTE(review): buf, inputStream and outputStream are fields declared
    // elsewhere in this class; the streams are not passed buf here, so
    // presumably they are attached to it later — confirm against the class.
    buf = new byte[initialsize];
    inputStream = new BufferInputStream();
    outputStream = new BufferOutputStream();
}