public (DateTime[] dateTimes, int[] objectIds, float[] values) ParquetSharp()
{
    // Read the three benchmark columns (timestamp, object id, value) from the
    // first row group of the file and optionally verify them against the
    // reference arrays.
    using var fileReader = new ParquetFileReader(Filename);
    using var groupReader = fileReader.RowGroup(0);

    // Local helper so each logical column reader is disposed as soon as its
    // column has been read.
    T[] ReadColumn<T>(int columnIndex)
    {
        using var columnReader = groupReader.Column(columnIndex).LogicalReader<T>();
        return columnReader.ReadAll(_numRows);
    }

    var dateTimes = ReadColumn<DateTime>(0);
    var objectIds = ReadColumn<int>(1);
    var values = ReadColumn<float>(2);

    fileReader.Close();

    if (Check.Enabled)
    {
        Check.ArraysAreEqual(_allDates, dateTimes);
        Check.ArraysAreEqual(_allObjectIds, objectIds);
        Check.ArraysAreEqual(_allValues, values);
    }

    return (dateTimes, objectIds, values);
}
public static void TestHasNext()
{
    // Write a single int32 column, read it back in one batch, and check that
    // HasNext reports exhaustion once every row has been consumed.
    const int numRows = 5;
    var schemaColumns = new Column[] {new Column<int>("int32_field")};
    var values = Enumerable.Range(0, numRows).ToArray();

    using var buffer = new ResizableBuffer();

    using (var outStream = new BufferOutputStream(buffer))
    {
        using var writer = new ParquetFileWriter(outStream, schemaColumns);
        using var rowGroupWriter = writer.AppendRowGroup();
        using var colWriter = (ColumnWriter<int>) rowGroupWriter.NextColumn();

        colWriter.WriteBatch(values);

        writer.Close();
    }

    // Read back the columns and make sure they match.
    using var inStream = new BufferReader(buffer);
    using var fileReader = new ParquetFileReader(inStream);
    using var rowGroupReader = fileReader.RowGroup(0);
    using var column = (ColumnReader<int>) rowGroupReader.Column(0);

    var read = new int[1024];
    column.ReadBatch(1024, read, out var numValues);

    // Fix: NUnit's Assert.AreEqual takes (expected, actual) - the original
    // passed them swapped, which produces a misleading failure message.
    Assert.AreEqual(numRows, numValues);
    Assert.AreEqual(values, read.AsSpan(0, numRows).ToArray());
    Assert.IsFalse(column.HasNext);
}
public static void TestInMemoryRoundTrip()
{
    // Round-trip 1M integers through a parquet file held entirely in a
    // MemoryStream, via the managed stream adapters.
    var expected = Enumerable.Range(0, 1024 * 1024).ToArray();

    using var memory = new MemoryStream();

    // Write test data.
    using (var outStream = new ManagedOutputStream(memory, leaveOpen: true))
    {
        using var fileWriter = new ParquetFileWriter(outStream, new Column[] {new Column<int>("ids")});
        using var rowGroup = fileWriter.AppendRowGroup();
        using var idWriter = rowGroup.NextColumn().LogicalWriter<int>();

        idWriter.WriteBatch(expected);

        fileWriter.Close();
    }

    // Seek back to start.
    memory.Seek(0, SeekOrigin.Begin);

    // Read test data.
    using var inStream = new ManagedRandomAccessFile(memory, leaveOpen: true);
    using var fileReader = new ParquetFileReader(inStream);
    using var rowGroupReader = fileReader.RowGroup(0);
    using var idReader = rowGroupReader.Column(0).LogicalReader<int>();

    Assert.AreEqual(expected, idReader.ReadAll(expected.Length));
}
// Checks that disposing the writer and reader objects releases the underlying
// OS file handle. The File.Delete in the finally block would throw on Windows
// if the handle were still held, which would change the thrown exception and
// fail the Assert.Throws<InvalidCastException> expectation.
public static void TestFileHandleHasBeenReleased()
{
    var exception = Assert.Throws<InvalidCastException>(() =>
    {
        try
        {
            // Write an int32 column to a real file on disk.
            using (var writer = new ParquetFileWriter("file.parquet", new Column[] {new Column<int>("ids")}))
            {
                using var groupWriter = writer.AppendRowGroup();
                using var columnWriter = groupWriter.NextColumn().LogicalWriter<int>();
                columnWriter.WriteBatch(new[] {1, 2, 3});
                writer.Close();
            }

            // Open with the wrong logical reader type on purpose.
            using var reader = new ParquetFileReader("file.parquet");
            using var groupReader = reader.RowGroup(0);
            // Requesting a float reader on an int32 column throws InvalidCastException.
            using var columnReader = groupReader.Column(0).LogicalReader<float>();
            Assert.AreEqual(new[] {1, 2, 3}, columnReader.ReadAll(3));
        }
        finally
        {
            // This will throw on Windows if the file handle has not been released.
            File.Delete("file.parquet");
        }
    });

    StringAssert.StartsWith("Unable to cast object of type", exception?.Message);
}
public static void TestArrayOfEmptyStringArraysRoundtrip()
{
    // Four rows, each an empty string array, must survive a write/read cycle.
    var expected = new[]
    {
        new string[] { },
        new string[] { },
        new string[] { },
        new string[] { }
    };

    using var buffer = new ResizableBuffer();

    using (var output = new BufferOutputStream(buffer))
    {
        using var writer = new ParquetFileWriter(output, new Column[] {new Column<string[]>("a")});

        using (var groupWriter = writer.AppendRowGroup())
        {
            using var arrayWriter = groupWriter.NextColumn().LogicalWriter<string[]>();
            arrayWriter.WriteBatch(expected);
        }

        writer.Close();
    }

    using var input = new BufferReader(buffer);
    using var reader = new ParquetFileReader(input);
    using var group = reader.RowGroup(0);
    using var arrayReader = group.Column(0).LogicalReader<string[]>();

    Assert.AreEqual(4, group.MetaData.NumRows);

    var actual = arrayReader.ReadAll(4);
    Assert.AreEqual(expected, actual);
}
public static void TestFileStreamRoundTrip()
{
    // Round-trip a small int column through an actual file on disk,
    // wrapping the FileStreams in the managed input/output adapters.
    try
    {
        using (var outputStream = new ManagedOutputStream(File.OpenWrite("file.parquet")))
        {
            using var fileWriter = new ParquetFileWriter(outputStream, new Column[] {new Column<int>("ids")});
            using var rowGroup = fileWriter.AppendRowGroup();
            using var idWriter = rowGroup.NextColumn().LogicalWriter<int>();

            idWriter.WriteBatch(new[] {1, 2, 3});

            fileWriter.Close();
        }

        using var inputFile = new ManagedRandomAccessFile(File.OpenRead("file.parquet"));
        using var fileReader = new ParquetFileReader(inputFile);
        using var rowGroupReader = fileReader.RowGroup(0);
        using var idReader = rowGroupReader.Column(0).LogicalReader<int>();

        Assert.AreEqual(new[] {1, 2, 3}, idReader.ReadAll(3));
    }
    finally
    {
        // Always remove the temporary file, even if the test fails.
        File.Delete("file.parquet");
    }
}
// Checks that all writer/reader objects release the underlying OS file handle
// when their using blocks end. The File.Delete in the finally block would
// throw on Windows if the handle were still held, changing the observed
// exception type and failing the Assert.Throws expectation.
public static void TestFileHandleHasBeenReleased()
{
    var exception = Assert.Throws<InvalidCastException>(() =>
    {
        try
        {
            // Write an int32 column to a real file on disk.
            using (var writer = new ParquetFileWriter("file.parquet", new Column[] {new Column<int>("ids")}))
            using (var group = writer.AppendRowGroup())
            using (var column = group.NextColumn().LogicalWriter<int>())
            {
                column.WriteBatch(new[] {1, 2, 3});
            }

            // Open with the wrong logical reader type on purpose.
            using (var reader = new ParquetFileReader("file.parquet"))
            using (var group = reader.RowGroup(0))
            using (var column = group.Column(0).LogicalReader<float>())
            {
                Assert.AreEqual(new[] {1, 2, 3}, column.ReadAll(3));
            }
        }
        finally
        {
            // This will throw on Windows if the file handle has not been released.
            File.Delete("file.parquet");
        }
    });

    // The column is int32-backed, so requesting a float logical reader fails
    // with this exact cast error.
    Assert.AreEqual(
        "Unable to cast object of type " +
        "'ParquetSharp.LogicalColumnReader`3[System.Int32,System.Int32,System.Int32]'" +
        " to type 'ParquetSharp.LogicalColumnReader`1[System.Single]'.",
        exception.Message);
}
// Round-trips values through parquet file bytes held in a managed byte array,
// reading them back via an IO.Buffer that wraps a pinned pointer into the
// array. The 'fixed' scope must cover all of the reads, since the native side
// accesses the pinned memory directly.
public static unsafe void TestParquetReadFromBuffer()
{
    var expected = Enumerable.Range(0, 100).ToArray();

    // Write out a single column
    byte[] parquetFileBytes;
    using (var outBuffer = new ResizableBuffer())
    {
        using (var outStream = new BufferOutputStream(outBuffer))
        using (var fileWriter = new ParquetFileWriter(outStream, new Column[] {new Column<int>("int_field")}))
        using (var rowGroupWriter = fileWriter.AppendRowGroup())
        using (var colWriter = rowGroupWriter.NextColumn().LogicalWriter<int>())
        {
            colWriter.WriteBatch(expected);
        }

        // Snapshot the finished file into a plain managed array.
        parquetFileBytes = outBuffer.ToArray();
    }

    // Read it back
    fixed (byte* fixedBytes = parquetFileBytes)
    using (var buffer = new IO.Buffer(new IntPtr(fixedBytes), parquetFileBytes.Length))
    using (var inStream = new BufferReader(buffer))
    using (var fileReader = new ParquetFileReader(inStream))
    using (var rowGroup = fileReader.RowGroup(0))
    using (var columnReader = rowGroup.Column(0).LogicalReader<int>())
    {
        var allData = columnReader.ReadAll((int) rowGroup.MetaData.NumRows);
        Assert.AreEqual(expected, allData);
    }
}
// Reads back every column of the parquet file in 'buffer' and asserts the
// column descriptor properties, chunk statistics and values against
// 'expectedColumns'. Each column is read twice: once via the
// LogicalValueGetter visitor in batches of 'rowsPerBatch', and once via the
// IEnumerable-based LogicalColumnReaderToArray visitor.
private static void AssertReadRoundtrip(int rowsPerBatch, int readBufferLength, ResizableBuffer buffer, ExpectedColumn[] expectedColumns)
{
    using var inStream = new BufferReader(buffer);
    using var fileReader = new ParquetFileReader(inStream);
    using var fileMetaData = fileReader.FileMetaData;
    using var rowGroupReader = fileReader.RowGroup(0);

    var rowGroupMetaData = rowGroupReader.MetaData;
    var numRows = rowGroupMetaData.NumRows;

    for (int c = 0; c != fileMetaData.NumColumns; ++c)
    {
        var expected = expectedColumns[c];

        // Test properties, and read methods.
        using (var columnReader = rowGroupReader.Column(c).LogicalReader(readBufferLength))
        {
            var descr = columnReader.ColumnDescriptor;
            var chunkMetaData = rowGroupMetaData.GetColumnChunkMetaData(c);
            var statistics = chunkMetaData.Statistics;

            Console.WriteLine("Reading '{0}'", expected.Name);

            Assert.AreEqual(expected.PhysicalType, descr.PhysicalType);
            Assert.AreEqual(expected.LogicalType, descr.LogicalType);
            Assert.AreEqual(expected.Values, columnReader.Apply(new LogicalValueGetter(checked((int) numRows), rowsPerBatch)));
            Assert.AreEqual(expected.Length, descr.TypeLength);
            // Precision/scale only apply to decimal columns; -1 otherwise.
            Assert.AreEqual((expected.LogicalType as DecimalLogicalType)?.Precision ?? -1, descr.TypePrecision);
            Assert.AreEqual((expected.LogicalType as DecimalLogicalType)?.Scale ?? -1, descr.TypeScale);
            Assert.AreEqual(expected.HasStatistics, chunkMetaData.IsStatsSet);

            if (expected.HasStatistics)
            {
                Assert.AreEqual(expected.HasMinMax, statistics.HasMinMax);
                // Deliberately disabled; left here as a reminder of what is not checked.
                //Assert.AreEqual(expected.NullCount, statistics.NullCount);
                //Assert.AreEqual(expected.NumValues, statistics.NumValues);
                Assert.AreEqual(expected.PhysicalType, statistics.PhysicalType);

                // BUG Don't check for decimal until https://issues.apache.org/jira/browse/ARROW-6149 is fixed.
                var buggy = expected.LogicalType is DecimalLogicalType;
                if (expected.HasMinMax && !buggy)
                {
                    Assert.AreEqual(expected.Min, expected.Converter(statistics.MinUntyped));
                    Assert.AreEqual(expected.Max, expected.Converter(statistics.MaxUntyped));
                }
            }
            else
            {
                Assert.IsNull(statistics);
            }
        }

        // Test IEnumerable interface
        using (var columnReader = rowGroupReader.Column(c).LogicalReader(readBufferLength))
        {
            Assert.AreEqual(expected.Values, columnReader.Apply(new LogicalColumnReaderToArray()));
        }
    }
}
public static void TestWriteLongString()
{
    const int numStrings = 100;

    // Generate lots of digits of 0.1234567891011121131415...
    var longSuffix = string.Join("", Enumerable.Range(1, 3500).Select(j => j.ToString())) + "...";
    var longStrings = Enumerable.Range(0, numStrings).Select(i => "0." + longSuffix).ToArray();

    using var buffer = new ResizableBuffer();

    using (var output = new BufferOutputStream(buffer))
    {
        using var writer = new ParquetFileWriter(output, new Column[] {new Column<string>("Name")});
        using var rowGroup = writer.AppendRowGroup();
        using var nameWriter = rowGroup.NextColumn().LogicalWriter<string>();

        // Strings to byte arrays memory pooling is done by the ByteBuffer class.
        // If something is fishy there (e.g. bad memory ownership wrt the GC),
        // we expect to see consequences here if we write enough strings.
        // It's not bullet proof, but it has found a few issues.
        nameWriter.WriteBatch(longStrings);

        writer.Close();
    }

    using var input = new BufferReader(buffer);
    using var reader = new ParquetFileReader(input);
    using var rowGroupReader = reader.RowGroup(0);
    using var nameReader = rowGroupReader.Column(0).LogicalReader<string>();

    Assert.AreEqual(longStrings, nameReader.ReadAll(numStrings));
}
public static void Demo()
{
    // Open input and output file streams. Fix: dispose the streams with
    // 'using' declarations so they are released even if a later constructor
    // throws (matches the stream handling in DecryptParquetFileCorrectly).
    using Stream inputFile = File.OpenRead(".\\ResourceFiles\\userdata1.parquet");
    using Stream outputFile = File.OpenWrite(".\\ResourceFiles\\out1.parquet");

    // Create reader
    using ParquetFileReader reader = new ParquetFileReader(inputFile);

    // Copy source settings as target settings
    List<FileEncryptionSettings> writerSettings = reader.FileEncryptionSettings
        .Select(s => Copy(s))
        .ToList();

    // Modify a few column settings
    writerSettings[0] = new FileEncryptionSettings<DateTimeOffset?>(encryptionKey, SqlSerializerFactory.Default.GetDefaultSerializer<DateTimeOffset?>());
    writerSettings[3] = new FileEncryptionSettings<string>(encryptionKey, EncryptionType.Deterministic, new SqlVarcharSerializer(size: 255));
    writerSettings[10] = new FileEncryptionSettings<double?>(encryptionKey, StandardSerializerFactory.Default.GetDefaultSerializer<double?>());

    // Create and pass the target settings to the writer
    using ParquetFileWriter writer = new ParquetFileWriter(outputFile, writerSettings);

    // Process the file
    ColumnarCryptographer cryptographer = new ColumnarCryptographer(reader, writer);
    cryptographer.Transform();

    Console.Clear();
}
public static void TestWriteBatchWithNullOptionalField()
{
    // Write an optional int32 column through the physical writer with explicit
    // definition levels (1 = value present, 0 = null), then read it back with
    // the logical reader and check the null is reconstructed.
    using var buffer = new ResizableBuffer();

    using (var outStream = new BufferOutputStream(buffer))
    {
        using var fileWriter = new ParquetFileWriter(outStream, new Column[] {new Column<int?>("int32?")});
        using var rowGroupWriter = fileWriter.AppendRowGroup();
        using var physicalWriter = (ColumnWriter<int>) rowGroupWriter.NextColumn();

        // Three rows: value, null, value - only two physical values are stored.
        var defLevels = new short[] {1, 0, 1};
        var physicalValues = new[] {1, 2};

        physicalWriter.WriteBatch(defLevels.Length, defLevels, null, physicalValues);
    }

    using (var inStream = new BufferReader(buffer))
    {
        using var fileReader = new ParquetFileReader(inStream);
        using var rowGroupReader = fileReader.RowGroup(0);
        using var logicalReader = rowGroupReader.Column(0).LogicalReader<int?>();

        var results = new int?[3];
        logicalReader.ReadBatch(results, 0, 3);

        Assert.AreEqual(new int?[] {1, null, 2}, results);
    }
}
public static void TestBufferOutputStreamFinish()
{
    // Write to an auto-allocating BufferOutputStream, then obtain the finished
    // buffer via Finish() and read the data back out of it.
    var expected = Enumerable.Range(0, 100).ToArray();

    using var outStream = new BufferOutputStream();

    // Write out a single column
    using (var writer = new ParquetFileWriter(outStream, new Column[] {new Column<int>("int_field")}))
    {
        using (var rowGroup = writer.AppendRowGroup())
        {
            using var valueWriter = rowGroup.NextColumn().LogicalWriter<int>();
            valueWriter.WriteBatch(expected);
        }

        writer.Close();
    }

    // Read it back
    using var buffer = outStream.Finish();
    using var input = new BufferReader(buffer);
    using var reader = new ParquetFileReader(input);
    using var group = reader.RowGroup(0);
    using var valueReader = group.Column(0).LogicalReader<int>();

    var actual = valueReader.ReadAll((int) group.MetaData.NumRows);
    Assert.AreEqual(expected, actual);
}
// Decrypts ciphertext.parquet using the Azure key-store provider, rewriting
// columns 0, 3 and 10 as plaintext while keeping the remaining columns'
// settings as clones of the source file's encryption settings.
public void DecryptParquetFileCorrectly()
{
    using Stream inputFile = File.OpenRead("ResourceFiles\\ciphertext.parquet");
    using Stream outputFile = File.OpenWrite($"ResourceFiles\\{nameof(DecryptParquetFileCorrectly)}_out.parquet");
    using ParquetFileReader reader = new ParquetFileReader(inputFile);

    // Make the key-store provider available so the data encryption key can be unwrapped.
    reader.RegisterKeyStoreProviders(
        new Dictionary<string, EncryptionKeyStoreProvider>
        {
            [azureKeyProvider.ProviderName] = azureKeyProvider
        }
    );

    // Start from a copy of the source file's per-column encryption settings.
    var writerSettings = reader.FileEncryptionSettings
        .Select(s => (FileEncryptionSettings) s.Clone())
        .ToList();

    // Each source column's serializer determines the target column type.
    var targetColumnTypes = reader.FileEncryptionSettings
        .Select(s => s.GetSerializer().GetGenericType())
        .ToList();

    // Rewrite three specific columns as plaintext.
    // NOTE(review): indices 0, 3 and 10 presumably correspond to known
    // encrypted columns in ciphertext.parquet - confirm against the resource file.
    writerSettings[0] = Create(targetColumnTypes[0], dataEncryptionKey, EncryptionType.Plaintext, GetSerializer(targetColumnTypes[0]));
    writerSettings[3] = Create(targetColumnTypes[3], dataEncryptionKey, EncryptionType.Plaintext, GetSerializer(targetColumnTypes[3]));
    writerSettings[10] = Create(targetColumnTypes[10], dataEncryptionKey, EncryptionType.Plaintext, GetSerializer(targetColumnTypes[10]));

    using ParquetFileWriter writer = new ParquetFileWriter(outputFile, writerSettings);

    // Copy every column from reader to writer, applying the new settings.
    ColumnarCryptographer cryptographer = new ColumnarCryptographer(reader, writer);
    cryptographer.Transform();
}
private static void CheckWrittenValues<TValue>(ResizableBuffer buffer, TValue[] expected)
{
    // Read the first column of the parquet file held in 'buffer' and verify
    // the values round-tripped as 'expected'.
    using var input = new BufferReader(buffer);
    using var reader = new ParquetFileReader(input);
    using var rowGroup = reader.RowGroup(0);
    using var logicalReader = rowGroup.Column(0).LogicalReader<TValue>();

    var numRows = checked((int) rowGroup.MetaData.NumRows);
    var actual = logicalReader.ReadAll(numRows);

    Assert.AreEqual(expected, actual);
}
public static void TestReadNoConverterFactory()
{
    // Without a LogicalReadConverterFactory, requesting a reader override for
    // a custom user type must fail with NotSupportedException.
    using var buffer = WriteTestValues(Values);
    using var input = new BufferReader(buffer);
    using var reader = new ParquetFileReader(input);
    using var rowGroup = reader.RowGroup(0);

    var exception = Assert.Throws<NotSupportedException>(
        () => rowGroup.Column(0).LogicalReaderOverride<VolumeInDollars>());

    StringAssert.StartsWith("unsupported logical system type", exception?.Message);
}
// Asserts the file-level metadata (creator string, key-value metadata, row and
// column counts, format version) and then, for every column, the descriptor
// properties, the chunk encodings/compression, and the physical values.
// When dictionary encoding is disabled, PlainDictionary is filtered out of the
// expected encodings.
private static void AssertReadRoundtrip(ResizableBuffer buffer, ExpectedColumn[] expectedColumns, bool useDictionaryEncoding)
{
    using var inStream = new BufferReader(buffer);
    using var fileReader = new ParquetFileReader(inStream);
    using var fileMetaData = fileReader.FileMetaData;

    // All expected columns have the same number of values.
    var numRows = expectedColumns.First().Values.Length;

    Assert.AreEqual("parquet-cpp version 1.5.1-SNAPSHOT", fileMetaData.CreatedBy);
    Assert.AreEqual(new Dictionary<string, string> {{"case", "Test"}, {"Awesome", "true"}}, fileMetaData.KeyValueMetadata);
    Assert.AreEqual(expectedColumns.Length, fileMetaData.NumColumns);
    Assert.AreEqual(numRows, fileMetaData.NumRows);
    Assert.AreEqual(1, fileMetaData.NumRowGroups);
    // One root schema element plus one per column.
    Assert.AreEqual(1 + expectedColumns.Length, fileMetaData.NumSchemaElements);
    Assert.AreEqual(ParquetVersion.PARQUET_1_0, fileMetaData.Version);
    Assert.AreEqual("parquet-cpp version 1.5.1", fileMetaData.WriterVersion.ToString());

    using var rowGroupReader = fileReader.RowGroup(0);
    var rowGroupMetaData = rowGroupReader.MetaData;

    for (int c = 0; c != fileMetaData.NumColumns; ++c)
    {
        using var columnReader = rowGroupReader.Column(c);
        var expected = expectedColumns[c];

        Console.WriteLine("Reading '{0}'", expected.Name);

        var descr = columnReader.ColumnDescriptor;
        var chunkMetaData = rowGroupMetaData.GetColumnChunkMetaData(c);

        Assert.AreEqual(expected.MaxDefinitionlevel, descr.MaxDefinitionLevel);
        Assert.AreEqual(expected.MaxRepetitionLevel, descr.MaxRepetitionLevel);
        Assert.AreEqual(expected.PhysicalType, descr.PhysicalType);
        Assert.AreEqual(expected.LogicalType, descr.LogicalType);
        Assert.AreEqual(expected.ColumnOrder, descr.ColumnOrder);
        Assert.AreEqual(expected.SortOrder, descr.SortOrder);
        Assert.AreEqual(expected.Name, descr.Name);
        Assert.AreEqual(expected.TypeLength, descr.TypeLength);
        Assert.AreEqual(expected.TypePrecision, descr.TypePrecision);
        Assert.AreEqual(expected.TypeScale, descr.TypeScale);
        // PlainDictionary only appears among the encodings when the writer had
        // dictionary encoding enabled.
        Assert.AreEqual(expected.Encodings.Where(e => useDictionaryEncoding || e != Encoding.PlainDictionary).ToArray(), chunkMetaData.Encodings.Distinct().ToArray());
        Assert.AreEqual(expected.Compression, chunkMetaData.Compression);
        Assert.AreEqual(expected.Values, columnReader.Apply(new PhysicalValueGetter(chunkMetaData.NumValues)).values);
    }
}
public static void TestSkip()
{
    // Write 11 rows, skip the first 5 on read, verify the remaining values,
    // and check that Skip is clamped to the total row count.
    const int numRows = 11;
    var schemaColumns = new Column[] {new Column<int>("int32_field")};
    var values = Enumerable.Range(0, numRows).ToArray();

    using var buffer = new ResizableBuffer();

    using (var outStream = new BufferOutputStream(buffer))
    {
        using var writer = new ParquetFileWriter(outStream, schemaColumns);
        using (var rowGroupWriter = writer.AppendRowGroup())
        {
            // Fix: dispose the physical column writer (consistent with TestHasNext).
            using var colWriter = (ColumnWriter<int>) rowGroupWriter.NextColumn();
            colWriter.WriteBatch(numRows, values);
        }
        writer.Close();
    }

    using var inStream = new BufferReader(buffer);
    using var fileReader = new ParquetFileReader(inStream);
    using var rowGroupReader = fileReader.RowGroup(0);

    // Read back the columns after skipping numToSkip rows and make sure the values are what we expect.
    using (var column = rowGroupReader.Column(0))
    {
        const int numToSkip = 5;

        var skipped = column.Skip(numToSkip);
        Assert.AreEqual(numToSkip, skipped);

        var read = new int[1024];
        ((ColumnReader<int>) column).ReadBatch(1024, read, out var numValues);

        // Fix: NUnit's Assert.AreEqual takes (expected, actual) - arguments
        // were swapped, which produces a misleading failure message.
        Assert.AreEqual(numRows - numToSkip, numValues);
        Assert.AreEqual(values.AsSpan(numToSkip).ToArray(), read.AsSpan(0, numRows - numToSkip).ToArray());
    }

    // Check skipped is bound to the maximum number of rows.
    using (var column = rowGroupReader.Column(0))
    {
        var skipped = column.Skip(1024);
        Assert.AreEqual(numRows, skipped);
        Assert.IsFalse(column.HasNext);
    }
}
// Writes a float column with the ByteStreamSplit encoding (dictionary
// encoding explicitly disabled for that column) alongside a default-encoded
// int id column, then verifies both the per-chunk encoding metadata and the
// round-tripped values.
public static void TestByteStreamSplitEncoding()
{
    const int numRows = 10230;

    var ids = Enumerable.Range(0, numRows).ToArray();
    var values = ids.Select(i => i / 3.14f).ToArray();

    using var buffer = new ResizableBuffer();

    using (var output = new BufferOutputStream(buffer))
    {
        var columns = new Column[]
        {
            new Column<int>("id"),
            new Column<float>("value")
        };

        // Only the "value" column gets ByteStreamSplit; its dictionary
        // encoding is turned off so the requested encoding takes effect.
        var p = new WriterPropertiesBuilder()
            .Compression(Compression.Lz4)
            .DisableDictionary("value")
            .Encoding("value", Encoding.ByteStreamSplit)
            .Build();

        using var fileWriter = new ParquetFileWriter(output, columns, p);
        using var groupWriter = fileWriter.AppendRowGroup();

        using var idWriter = groupWriter.NextColumn().LogicalWriter<int>();
        idWriter.WriteBatch(ids);

        using var valueWriter = groupWriter.NextColumn().LogicalWriter<float>();
        valueWriter.WriteBatch(values);

        fileWriter.Close();
    }

    using var input = new BufferReader(buffer);
    using var fileReader = new ParquetFileReader(input);
    using var groupReader = fileReader.RowGroup(0);

    using var metadataId = groupReader.MetaData.GetColumnChunkMetaData(0);
    using var metadataValue = groupReader.MetaData.GetColumnChunkMetaData(1);

    // The id column uses the default dictionary encoding; the value column
    // reports ByteStreamSplit as requested.
    Assert.AreEqual(new[] {Encoding.PlainDictionary, Encoding.Plain, Encoding.Rle}, metadataId.Encodings);
    Assert.AreEqual(new[] {Encoding.ByteStreamSplit, Encoding.Rle}, metadataValue.Encodings);

    using var idReader = groupReader.Column(0).LogicalReader<int>();
    using var valueReader = groupReader.Column(1).LogicalReader<float>();

    Assert.AreEqual(ids, idReader.ReadAll(numRows));
    Assert.AreEqual(values, valueReader.ReadAll(numRows));
}
public decimal[] ParquetSharp()
{
    // Read all decimal values from the first column of the benchmark file and
    // optionally verify them against the reference array.
    using var fileReader = new ParquetFileReader(Filename);
    using var groupReader = fileReader.RowGroup(0);
    // Renamed from 'dateTimeReader': this column holds decimals, not timestamps.
    using var decimalReader = groupReader.Column(0).LogicalReader<decimal>();

    var results = decimalReader.ReadAll(_values.Length);

    if (Check.Enabled)
    {
        Check.ArraysAreEqual(_values, results);
    }

    return results;
}
public void CanReadNestedStructure()
{
    // Reads TestFiles/nested.parquet, whose nested schema is exposed as six
    // leaf columns, and checks every column's values.
    // Fix: dispose the row-group reader and every column reader (the original
    // leaked them all, holding the native file handle open).
    var directory = Path.GetDirectoryName(System.Reflection.Assembly.GetExecutingAssembly().Location);
    var path = Path.Combine(directory!, "TestFiles/nested.parquet");

    using var fileReader = new ParquetFileReader(path);
    using var rowGroupReader = fileReader.RowGroup(0);

    // first_level_long
    using var column0Reader = rowGroupReader.Column(0).LogicalReader<long?>();
    var column0Actual = column0Reader.ReadAll(2);
    var column0Expected = new[] {1, 2};
    Assert.AreEqual(column0Expected, column0Actual);

    // first_level_nullable_string
    using var column1Reader = rowGroupReader.Column(1).LogicalReader<string?>();
    var column1Actual = column1Reader.ReadAll(2);
    var column1Expected = new[] {null, "Not Null String"};
    Assert.AreEqual(column1Expected, column1Actual);

    // nullable_struct.nullable_struct_string
    using var column2Reader = rowGroupReader.Column(2).LogicalReader<string?>();
    var column2Actual = column2Reader.ReadAll(2);
    var column2Expected = new[] {"Nullable Struct String", null};
    Assert.AreEqual(column2Expected, column2Actual);

    // struct.struct_string
    using var column3Reader = rowGroupReader.Column(3).LogicalReader<string>();
    var column3Actual = column3Reader.ReadAll(2);
    var column3Expected = new[] {"First Struct String", "Second Struct String"};
    Assert.AreEqual(column3Expected, column3Actual);

    // struct_array.array_in_struct_array
    using var column4Reader = rowGroupReader.Column(4).LogicalReader<long?[]?[]>();
    var column4Actual = column4Reader.ReadAll(2);
    var column4Expected = new[]
    {
        new[] {new[] {111, 112, 113}, new[] {121, 122, 123}},
        new[] {new[] {211, 212, 213}}
    };
    Assert.AreEqual(column4Expected, column4Actual);

    // struct_array.string_in_struct_array
    using var column5Reader = rowGroupReader.Column(5).LogicalReader<string[]>();
    var column5Actual = column5Reader.ReadAll(2);
    var column5Expected = new[]
    {
        new[] {"First String", "Second String"},
        new[] {"Third String"}
    };
    Assert.AreEqual(column5Expected, column5Actual);
}
public static void TestReadNoTypeFactory()
{
    // Without a type factory, creating a logical reader for a custom user type
    // must fail: the underlying reader is float-based and cannot be cast.
    using var buffer = WriteTestValues(Values);
    using var input = new BufferReader(buffer);
    using var reader = new ParquetFileReader(input);
    using var rowGroup = reader.RowGroup(0);

    var exception = Assert.Throws<InvalidCastException>(() =>
    {
        using var columnReader = rowGroup.Column(0).LogicalReader<VolumeInDollars>();
    });

    StringAssert.StartsWith("Unable to cast object of type 'ParquetSharp.LogicalColumnReader`3[System.Single,System.Single,System.Single]", exception?.Message);
}
// Stress test for the ByteBuffer string-to-bytes memory pooling: a background
// task forces garbage collections continuously while 100k strings are written,
// to surface any memory-ownership bugs between the pooled buffers and the GC.
public static void TestByteBufferOptimisation()
{
    const int numStrings = 100_000;

    var strings = Enumerable.Range(0, numStrings).Select(i => i.ToString()).ToArray();

    var cancel = new CancellationTokenSource();
    var task = Task.Run(() =>
    {
        // Hammer the GC for the whole duration of the write/read below.
        while (!cancel.IsCancellationRequested)
        {
            GC.Collect();
            GC.WaitForPendingFinalizers();
            Thread.Sleep(1);
        }
    });

    using (var buffer = new ResizableBuffer())
    {
        using (var outStream = new BufferOutputStream(buffer))
        {
            using var fileWriter = new ParquetFileWriter(outStream, new Column[] {new Column<string>("Name")});

            using (var groupWriter = fileWriter.AppendRowGroup())
            {
                using var columnWriter = groupWriter.NextColumn().LogicalWriter<string>();

                // Strings to byte arrays memory pooling is done by the ByteBuffer class.
                // If something is fishy there (e.g. bad memory ownership wrt the GC),
                // we expect to see consequences here if we write enough strings.
                // It's not bullet proof, but it has found a few issues.
                columnWriter.WriteBatch(strings);
            }

            fileWriter.Close();
        }

        using var inStream = new BufferReader(buffer);
        using var fileReader = new ParquetFileReader(inStream);
        using var groupReader = fileReader.RowGroup(0);
        using var columnReader = groupReader.Column(0).LogicalReader<string>();

        Assert.AreEqual(strings, columnReader.ReadAll(numStrings));
    }

    // Stop the GC-stress task before leaving the test.
    cancel.Cancel();
    task.Wait();
}
// Writes 10k rows containing many duplicate string values and checks that,
// with dictionary encoding enabled, the reader materialises duplicates as the
// same string instances (reference-deduplicated), while without a dictionary
// every row yields its own instance.
public static void TestReadingDuplicateStrings([Values(true, false)] bool enableDictionary)
{
    var columns = new Column[]
    {
        new Column<DateTime>("dateTime"),
        new Column<string>("value")
    };

    const int numRows = 10_000;
    // Fixed seed keeps the data (and the distinct-count below) deterministic.
    var rand = new Random(1);

    var dates = Enumerable.Range(0, numRows).Select(i => new DateTime(2020, 01, 01).AddDays(i)).ToArray();
    // Only 100 distinct string values (multiples of 1000 below 100000).
    var values = Enumerable.Range(0, numRows).Select(i => (rand.Next(0, 100) * 1000).ToString()).ToArray();

    using var buffer = new ResizableBuffer();

    // Write a file that contains a lot of duplicate strings.
    using (var output = new BufferOutputStream(buffer))
    {
        using var fileWriter = new ParquetFileWriter(output, columns, CreateWriterProperties(enableDictionary));
        using var groupWriter = fileWriter.AppendRowGroup();

        using var dateWriter = groupWriter.NextColumn().LogicalWriter<DateTime>();
        dateWriter.WriteBatch(dates);

        using var valueWrite = groupWriter.NextColumn().LogicalWriter<string>();
        valueWrite.WriteBatch(values);
    }

    using var input = new BufferReader(buffer);
    using var fileReader = new ParquetFileReader(input);
    using var groupReader = fileReader.RowGroup(0);

    using var dateReader = groupReader.Column(0).LogicalReader<DateTime>();
    var readDates = dateReader.ReadAll(numRows);

    using var valueReader = groupReader.Column(1).LogicalReader<string>();
    var readValues = valueReader.ReadAll(numRows);

    Assert.AreEqual(dates, readDates);
    Assert.AreEqual(values, readValues);

    // When reading back the file, we expect the duplicate strings to point to the same memory instances.
    Assert.That(
        readValues.Distinct(new StringReferenceComparer()).Count(),
        enableDictionary ? Is.EqualTo(100) : Is.EqualTo(numRows));
}
public static void TestBigArrayRoundtrip()
{
    // Create a big array of float arrays. Try to detect buffer-size related issues.
    const int arrayLength = 8196;
    const int rowCount = 4;

    var bigArray = Enumerable.Range(0, arrayLength).Select(i => (float) i).ToArray();
    // Every row references the same backing array; only values matter here.
    var expected = Enumerable.Repeat(bigArray, rowCount).ToArray();

    using var buffer = new ResizableBuffer();

    // Write out a single column
    using (var output = new BufferOutputStream(buffer))
    {
        using var writer = new ParquetFileWriter(output, new Column[] {new Column<float[]>("big_array_field")});

        using (var groupWriter = writer.AppendRowGroup())
        {
            using var arrayWriter = groupWriter.NextColumn().LogicalWriter<float[]>();
            arrayWriter.WriteBatch(expected);
        }

        writer.Close();
    }

    // Read it back.
    using var input = new BufferReader(buffer);
    using var reader = new ParquetFileReader(input);
    using var group = reader.RowGroup(0);
    using var arrayReader = group.Column(0).LogicalReader<float[]>();

    var actual = arrayReader.ReadAll((int) group.MetaData.NumRows);
    Assert.AreEqual(expected, actual);
}
// Footer left unencrypted; only the "Value" column is encrypted (with "Key2").
// Verifies the crypto metadata of both columns, and that the unencrypted id
// column can still be read without supplying any decryption properties.
public static void TestEncryptJustOneColumn()
{
    // Case where the footer is unencrypted and only one column is encrypted.
    using (var buffer = new ResizableBuffer())
    {
        using (var output = new BufferOutputStream(buffer))
        {
            using var fileEncryptionProperties = CreateEncryptJustOneColumnProperties();
            WriteParquetFile(output, fileEncryptionProperties);
        }

        // Decrypt the whole parquet file with matching decrypt properties.
        using (var input = new BufferReader(buffer))
        {
            using var fileDecryptionProperties = CreateDecryptWithKeyRetrieverProperties();

            ReadParquetFile(fileDecryptionProperties, input, rowGroupMetadata =>
            {
                using var colMetadata0 = rowGroupMetadata.GetColumnChunkMetaData(0);
                using var colMetadata1 = rowGroupMetadata.GetColumnChunkMetaData(1);
                using var crypto0 = colMetadata0.CryptoMetadata;
                using var crypto1 = colMetadata1.CryptoMetadata;

                // Column 0 is unencrypted, so it carries no crypto metadata.
                Assert.AreEqual(null, crypto0);

                // Column 1 is encrypted with its own key, not the footer key.
                Assert.AreEqual("Value", crypto1.ColumnPath.ToDotString());
                Assert.AreEqual(false, crypto1.EncryptedWithFooterKey);
                Assert.AreEqual("Key2", crypto1.KeyMetadata);
            });
        }

        // Decrypt only the unencrypted column without providing any decrypt properties.
        using (var input = new BufferReader(buffer))
        {
            using var fileReader = new ParquetFileReader(input);
            using var groupReader = fileReader.RowGroup(0);

            var numRows = (int) groupReader.MetaData.NumRows;

            using (var idReader = groupReader.Column(0).LogicalReader<int>())
            {
                Assert.AreEqual(Ids, idReader.ReadAll(numRows));
            }
        }
    }
}
// Reader tests.
private static void TestRead<TCustom, TValue>(TCustom[] expected, TValue[] written)
{
    // Read float values into a custom user-type:
    // - Provide a converter factory such that the stored values can be read as TCustom.
    // - Explicitly override the expected type when accessing the LogicalColumnReader.
    using var buffer = WriteTestValues(written);
    using var input = new BufferReader(buffer);

    using var reader = new ParquetFileReader(input)
    {
        LogicalReadConverterFactory = new ReadConverterFactory()
    };

    using var rowGroup = reader.RowGroup(0);
    using var overrideReader = rowGroup.Column(0).LogicalReaderOverride<TCustom>();

    var numRows = checked((int) rowGroup.MetaData.NumRows);
    Assert.AreEqual(expected, overrideReader.ReadAll(numRows));
}
public static void TestArrayEdgeCasesRoundtrip()
{
    /*
     * [None, [], [1.0, None, 2.0]]
     * []
     * None
     * [[]]
     */
    var expected = new double?[][][]
    {
        new double?[][] {null, new double?[] { }, new double?[] {1.0, null, 2.0}},
        new double?[][] { },
        null,
        new double?[][] {new double?[] { }}
    };

    using var buffer = new ResizableBuffer();

    using (var output = new BufferOutputStream(buffer))
    {
        using var writer = new ParquetFileWriter(output, new Column[] {new Column<double?[][]>("a")});

        using (var groupWriter = writer.AppendRowGroup())
        {
            using var arrayWriter = groupWriter.NextColumn().LogicalWriter<double?[][]>();
            arrayWriter.WriteBatch(expected);
        }

        writer.Close();
    }

    using var input = new BufferReader(buffer);
    using var reader = new ParquetFileReader(input);
    using var group = reader.RowGroup(0);
    using var arrayReader = group.Column(0).LogicalReader<double?[][]>();

    Assert.AreEqual(4, group.MetaData.NumRows);

    var actual = arrayReader.ReadAll(4);
    Assert.AreEqual(expected, actual);
}
private static void ReadParquetFile(FileDecryptionProperties fileDecryptionProperties, BufferReader input, Action<RowGroupMetaData> onGroupReader)
{
    // Open the (possibly encrypted) file with the given decryption properties,
    // let the caller inspect the row-group metadata, then verify both columns.
    using var readerProperties = CreateReaderProperties(fileDecryptionProperties);
    using var fileReader = new ParquetFileReader(input, readerProperties);
    using var rowGroup = fileReader.RowGroup(0);

    var metaData = rowGroup.MetaData;
    var numRows = (int) metaData.NumRows;

    // Optional metadata inspection hook.
    onGroupReader?.Invoke(metaData);

    using (var idReader = rowGroup.Column(0).LogicalReader<int>())
    {
        Assert.AreEqual(Ids, idReader.ReadAll(numRows));
    }

    using (var valueReader = rowGroup.Column(1).LogicalReader<float>())
    {
        Assert.AreEqual(Values, valueReader.ReadAll(numRows));
    }
}
private void ReadParquetFile(ResizableBuffer buffer, MemoryPool pool)
{
    // Check the key-value metadata, then read all three columns back.
    // NOTE(review): 'pool' is unused in this body - presumably kept so callers
    // can assert on pool allocations after the read; confirm against call sites.
    using var input = new BufferReader(buffer);
    using var fileReader = new ParquetFileReader(input);

    var keyValueMetadata = fileReader.FileMetaData.KeyValueMetadata;
    Assert.AreEqual(_keyValueProperties, keyValueMetadata);

    using var rowGroupReader = fileReader.RowGroup(0);
    var numRows = checked((int) rowGroupReader.MetaData.NumRows);

    using var dateTimeReader = rowGroupReader.Column(0).LogicalReader<DateTime>();
    using var objectIdReader = rowGroupReader.Column(1).LogicalReader<int>();
    using var valueReader = rowGroupReader.Column(2).LogicalReader<float>();

    // Values are read but deliberately not asserted here.
    dateTimeReader.ReadAll(numRows);
    objectIdReader.ReadAll(numRows);
    valueReader.ReadAll(numRows);

    fileReader.Close();
}