private void CreateParquetFile(ResizableBuffer buffer)
{
    using (var output = new BufferOutputStream(buffer))
    using (var fileWriter = new ParquetFileWriter(output, CreateFloatColumns(), keyValueMetadata: _keyValueProperties))
    {
        using var rowGroupWriter = fileWriter.AppendRowGroup();

        using (var dateTimeWriter = rowGroupWriter.NextColumn().LogicalWriter<DateTime>())
        {
            for (int i = 0; i != _dates.Length; ++i)
            {
                dateTimeWriter.WriteBatch(Enumerable.Repeat(_dates[i], _objectIds.Length).ToArray());
            }
        }

        using (var objectIdWriter = rowGroupWriter.NextColumn().LogicalWriter<int>())
        {
            for (int i = 0; i != _dates.Length; ++i)
            {
                objectIdWriter.WriteBatch(_objectIds);
            }
        }

        using (var valueWriter = rowGroupWriter.NextColumn().LogicalWriter<float>())
        {
            for (int i = 0; i != _dates.Length; ++i)
            {
                valueWriter.WriteBatch(_values[i]);
            }
        }

        fileWriter.Close();
    }
}
private static void TestWriteNoColumnNorWriterOverride<TValue, TCustom>(TValue[] expected, TCustom[] written)
{
    using var buffer = new ResizableBuffer();

    // Write float values using a custom user type:
    // - Provide an explicit schema definition that knows nothing about VolumeInDollars and states that it's a float column.
    // - Provide a type factory such that Column("values") is known to be of type VolumeInDollars,
    //   as we do not explicitly state the expected type when accessing the LogicalColumnWriter.
    // - Provide a converter factory such that VolumeInDollars values can be written as floats.
    // - Do not explicitly override the expected type when accessing the LogicalColumnWriter.
    using (var output = new BufferOutputStream(buffer))
    {
        using var schema = Column.CreateSchemaNode(new Column[] { new Column<TValue>("values") });
        using var writerProperties = CreateWriterProperties();
        using var fileWriter = new ParquetFileWriter(output, schema, writerProperties)
        {
            LogicalTypeFactory = new WriteTypeFactoryNoOverride(),
            LogicalWriteConverterFactory = new WriteConverterFactory()
        };
        using var groupWriter = fileWriter.AppendRowGroup();
        using var columnWriter = groupWriter.NextColumn().LogicalWriter<TCustom>();

        columnWriter.WriteBatch(written);
        fileWriter.Close();
    }

    CheckWrittenValues(buffer, expected);
}
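For context, a minimal sketch of the kind of custom user type these write tests exercise. The real VolumeInDollars definition lives elsewhere in the test suite, so treat this as an assumed shape: a thin value wrapper over float that the converter factory maps to the underlying physical column.

// Assumed shape of the custom user type referenced above (hypothetical sketch,
// not the canonical definition from the test suite).
internal readonly struct VolumeInDollars
{
    public VolumeInDollars(float value)
    {
        Value = value;
    }

    public readonly float Value;
}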
public static void TestWriteBatchWithNullOptionalField()
{
    using var buffer = new ResizableBuffer();

    using (var outStream = new BufferOutputStream(buffer))
    {
        using var writer = new ParquetFileWriter(outStream, new Column[] { new Column<int?>("int32?") });
        using var rowGroupWriter = writer.AppendRowGroup();
        using var colWriter = (ColumnWriter<int>)rowGroupWriter.NextColumn();

        // Definition level 1 = value present, 0 = null: {1, 0, 1} with values {1, 2} encodes {1, null, 2}.
        var defLevels = new short[] { 1, 0, 1 };
        var values = new[] { 1, 2 };

        colWriter.WriteBatch(defLevels.Length, defLevels, null, values);
        writer.Close();
    }

    using var inStream = new BufferReader(buffer);
    using var reader = new ParquetFileReader(inStream);
    using var rowGroupReader = reader.RowGroup(0);
    using var colReader = rowGroupReader.Column(0).LogicalReader<int?>();

    var results = new int?[3];
    colReader.ReadBatch(results, 0, 3);

    Assert.AreEqual(new int?[] { 1, null, 2 }, results);
}
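As a reading aid, here is a hypothetical helper (not part of the test suite) showing how definition levels interleave nulls with the dense value array for an optional leaf column: a level of 1 means the value is present, 0 means null at that slot.

// Hypothetical helper: reconstruct the logical values from definition levels
// and the dense value array, for an optional (max definition level 1) column.
private static int?[] ExpandByDefLevels(short[] defLevels, int[] values)
{
    var results = new int?[defLevels.Length];
    var v = 0;
    for (var i = 0; i < defLevels.Length; i++)
    {
        results[i] = defLevels[i] == 1 ? values[v++] : (int?)null;
    }
    return results;
}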
public static void TestReadException()
{
    var expected = Enumerable.Range(0, 1024 * 1024).ToArray();

    var exception = Assert.Throws<ParquetException>(() =>
    {
        using var buffer = new ErroneousReaderStream();

        using (var output = new ManagedOutputStream(buffer, leaveOpen: true))
        {
            using var writer = new ParquetFileWriter(output, new Column[] { new Column<int>("ids") });
            using var groupWriter = writer.AppendRowGroup();
            using var columnWriter = groupWriter.NextColumn().LogicalWriter<int>();

            columnWriter.WriteBatch(expected);
            writer.Close();
        }

        buffer.Seek(0, SeekOrigin.Begin);

        using var input = new ManagedRandomAccessFile(buffer);
        using (new ParquetFileReader(input))
        {
        }
    });

    Assert.That(exception.Message, Contains.Substring("this is an erroneous reader"));
}
private static void TestRoundTrip(ExpectedColumn[] expectedColumns, bool useDictionaryEncoding)
{
    var schema = CreateSchema(expectedColumns);
    var writerProperties = CreateWriterProperties(expectedColumns, useDictionaryEncoding);
    var keyValueMetadata = new Dictionary<string, string> { { "case", "Test" }, { "Awesome", "true" } };

    using var buffer = new ResizableBuffer();

    // Write our expected columns to the parquet in-memory file.
    using (var outStream = new BufferOutputStream(buffer))
    {
        using var fileWriter = new ParquetFileWriter(outStream, schema, writerProperties, keyValueMetadata);
        using var rowGroupWriter = fileWriter.AppendRowGroup();

        foreach (var column in expectedColumns)
        {
            Console.WriteLine("Writing '{0}'", column.Name);

            using var columnWriter = rowGroupWriter.NextColumn();
            columnWriter.Apply(new ValueSetter(column.Values));
        }

        fileWriter.Close();
    }

    // Read back the columns and make sure they match.
    AssertReadRoundtrip(buffer, expectedColumns, useDictionaryEncoding);
}
public static void TestWriteLongString()
{
    const int numStrings = 100;

    // Generate lots of digits of 0.123456789101112131415...
    var strings = Enumerable.Range(0, numStrings)
        .Select(i => "0." + string.Join("", Enumerable.Range(1, 3500).Select(j => j.ToString())) + "...")
        .ToArray();

    using var buffer = new ResizableBuffer();

    using (var outStream = new BufferOutputStream(buffer))
    {
        using var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column<string>("Name") });
        using var groupWriter = fileWriter.AppendRowGroup();
        using var columnWriter = groupWriter.NextColumn().LogicalWriter<string>();

        // String-to-byte-array memory pooling is done by the ByteBuffer class.
        // If something is fishy there (e.g. bad memory ownership wrt the GC),
        // we expect to see consequences here if we write enough strings.
        // It's not bullet proof, but it has found a few issues.
        columnWriter.WriteBatch(strings);

        fileWriter.Close();
    }

    using var inStream = new BufferReader(buffer);
    using var fileReader = new ParquetFileReader(inStream);
    using var groupReader = fileReader.RowGroup(0);
    using var columnReader = groupReader.Column(0).LogicalReader<string>();

    Assert.AreEqual(strings, columnReader.ReadAll(numStrings));
}
public static void TestAgainstThirdParty()
{
    var columns = new Column[] { new Column<decimal>("Decimal", LogicalType.Decimal(precision: 29, scale: 3)) };
    var values = Enumerable.Range(0, 10_000)
        .Select(i => ((decimal)i * i * i) / 1000 - 10)
        .Concat(new[] { decimal.MinValue / 1000, decimal.MaxValue / 1000 })
        .ToArray();

    using var buffer = new ResizableBuffer();

    // Write using ParquetSharp.
    using (var outStream = new BufferOutputStream(buffer))
    {
        using var fileWriter = new ParquetFileWriter(outStream, columns, Compression.Snappy);
        using var rowGroupWriter = fileWriter.AppendRowGroup();
        using var columnWriter = rowGroupWriter.NextColumn().LogicalWriter<decimal>();

        columnWriter.WriteBatch(values);
        fileWriter.Close();
    }

    // Read using Parquet.NET.
    using var memoryStream = new MemoryStream(buffer.ToArray());
    using var fileReader = new ParquetReader(memoryStream);
    using var rowGroupReader = fileReader.OpenRowGroupReader(0);

    var read = (decimal[])rowGroupReader.ReadColumn(fileReader.Schema.GetDataFields()[0]).Data;
    Assert.AreEqual(values, read);
}
public DecimalRead()
{
    Console.WriteLine("Writing data...");
    var timer = Stopwatch.StartNew();

    var rand = new Random(123);

    _values = Enumerable.Range(0, 1_000_000).Select(i =>
    {
        var n = rand.Next();
        var sign = rand.NextDouble() < 0.5 ? -1M : +1M;
        return sign * ((decimal)n * n * n) / 1000M;
    }).ToArray();

    using (var fileWriter = new ParquetFileWriter(Filename, new Column[] { new Column<decimal>("Value", LogicalType.Decimal(precision: 29, scale: 3)) }))
    {
        using var rowGroupWriter = fileWriter.AppendRowGroup();
        using var valueWriter = rowGroupWriter.NextColumn().LogicalWriter<decimal>();

        valueWriter.WriteBatch(_values);
        fileWriter.Close();
    }

    Console.WriteLine("Wrote {0:N0} rows in {1:N2} sec", _values.Length, timer.Elapsed.TotalSeconds);
    Console.WriteLine();
}
private void ParquetImpl(ParquetFileWriter fileWriter)
{
    using var rowGroupWriter = fileWriter.AppendRowGroup();

    using (var dateTimeWriter = rowGroupWriter.NextColumn().LogicalWriter<DateTime>())
    {
        for (int i = 0; i != _dates.Length; ++i)
        {
            dateTimeWriter.WriteBatch(Enumerable.Repeat(_dates[i], _objectIds.Length).ToArray());
        }
    }

    using (var objectIdWriter = rowGroupWriter.NextColumn().LogicalWriter<int>())
    {
        for (int i = 0; i != _dates.Length; ++i)
        {
            objectIdWriter.WriteBatch(_objectIds);
        }
    }

    using (var valueWriter = rowGroupWriter.NextColumn().LogicalWriter<float>())
    {
        for (int i = 0; i != _dates.Length; ++i)
        {
            valueWriter.WriteBatch(_values[i]);
        }
    }

    fileWriter.Close();
}
public static void TestFileHandleHasBeenReleased()
{
    var exception = Assert.Throws<InvalidCastException>(() =>
    {
        try
        {
            using (var writer = new ParquetFileWriter("file.parquet", new Column[] { new Column<int>("ids") }))
            {
                using var groupWriter = writer.AppendRowGroup();
                using var columnWriter = groupWriter.NextColumn().LogicalWriter<int>();

                columnWriter.WriteBatch(new[] { 1, 2, 3 });
                writer.Close();
            }

            // Open with the wrong logical reader type on purpose.
            using var reader = new ParquetFileReader("file.parquet");
            using var groupReader = reader.RowGroup(0);
            using var columnReader = groupReader.Column(0).LogicalReader<float>();

            Assert.AreEqual(new[] { 1, 2, 3 }, columnReader.ReadAll(3));
        }
        finally
        {
            // This will throw on Windows if the file handle has not been released.
            File.Delete("file.parquet");
        }
    });

    StringAssert.StartsWith("Unable to cast object of type", exception?.Message);
}
public static void TestArrayOfEmptyStringArraysRoundtrip()
{
    var expected = new[]
    {
        new string[] { },
        new string[] { },
        new string[] { },
        new string[] { }
    };

    using var buffer = new ResizableBuffer();

    using (var outStream = new BufferOutputStream(buffer))
    {
        using var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column<string[]>("a") });

        using (var rowGroupWriter = fileWriter.AppendRowGroup())
        {
            using var colWriter = rowGroupWriter.NextColumn().LogicalWriter<string[]>();
            colWriter.WriteBatch(expected);
        }

        fileWriter.Close();
    }

    using var inStream = new BufferReader(buffer);
    using var fileReader = new ParquetFileReader(inStream);
    using var rowGroup = fileReader.RowGroup(0);
    using var columnReader = rowGroup.Column(0).LogicalReader<string[]>();

    Assert.AreEqual(4, rowGroup.MetaData.NumRows);

    var allData = columnReader.ReadAll(4);
    Assert.AreEqual(expected, allData);
}
public static void TestBufferOutputStreamFinish()
{
    var expected = Enumerable.Range(0, 100).ToArray();

    using var outStream = new BufferOutputStream();

    // Write out a single column.
    using (var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column<int>("int_field") }))
    {
        using (var rowGroupWriter = fileWriter.AppendRowGroup())
        {
            using var colWriter = rowGroupWriter.NextColumn().LogicalWriter<int>();
            colWriter.WriteBatch(expected);
        }

        fileWriter.Close();
    }

    // Read it back.
    using var buffer = outStream.Finish();
    using var inStream = new BufferReader(buffer);
    using var fileReader = new ParquetFileReader(inStream);
    using var rowGroup = fileReader.RowGroup(0);
    using var columnReader = rowGroup.Column(0).LogicalReader<int>();

    var allData = columnReader.ReadAll((int)rowGroup.MetaData.NumRows);
    Assert.AreEqual(expected, allData);
}
public static void TestInMemoryRoundTrip()
{
    var expected = Enumerable.Range(0, 1024 * 1024).ToArray();

    using var buffer = new MemoryStream();

    // Write test data.
    using (var output = new ManagedOutputStream(buffer, leaveOpen: true))
    {
        using var writer = new ParquetFileWriter(output, new Column[] { new Column<int>("ids") });
        using var groupWriter = writer.AppendRowGroup();
        using var columnWriter = groupWriter.NextColumn().LogicalWriter<int>();

        columnWriter.WriteBatch(expected);
        writer.Close();
    }

    // Seek back to the start.
    buffer.Seek(0, SeekOrigin.Begin);

    // Read test data.
    using var input = new ManagedRandomAccessFile(buffer, leaveOpen: true);
    using var reader = new ParquetFileReader(input);
    using var groupReader = reader.RowGroup(0);
    using var columnReader = groupReader.Column(0).LogicalReader<int>();

    Assert.AreEqual(expected, columnReader.ReadAll(expected.Length));
}
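The same ManagedOutputStream pattern generalizes to any seekable .NET stream. A minimal convenience sketch, assuming the same column shape as the test above (the helper name WriteIdsToStream is ours, not part of the API):

// Hypothetical convenience helper: write a single int column named "ids" to any
// seekable .NET stream via ManagedOutputStream, then rewind it for reading.
// leaveOpen: true keeps the caller's stream alive after the Parquet writer is disposed.
private static Stream WriteIdsToStream(Stream stream, int[] ids)
{
    using (var output = new ManagedOutputStream(stream, leaveOpen: true))
    {
        using var writer = new ParquetFileWriter(output, new Column[] { new Column<int>("ids") });
        using var groupWriter = writer.AppendRowGroup();
        using var columnWriter = groupWriter.NextColumn().LogicalWriter<int>();

        columnWriter.WriteBatch(ids);
        writer.Close();
    }

    stream.Seek(0, SeekOrigin.Begin);
    return stream;
}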
public static void TestFileStreamRoundTrip()
{
    try
    {
        using (var output = new ManagedOutputStream(File.OpenWrite("file.parquet")))
        {
            using var writer = new ParquetFileWriter(output, new Column[] { new Column<int>("ids") });
            using var groupWriter = writer.AppendRowGroup();
            using var columnWriter = groupWriter.NextColumn().LogicalWriter<int>();

            columnWriter.WriteBatch(new[] { 1, 2, 3 });
            writer.Close();
        }

        using var input = new ManagedRandomAccessFile(File.OpenRead("file.parquet"));
        using var reader = new ParquetFileReader(input);
        using var groupReader = reader.RowGroup(0);
        using var columnReader = groupReader.Column(0).LogicalReader<int>();

        Assert.AreEqual(new[] { 1, 2, 3 }, columnReader.ReadAll(3));
    }
    finally
    {
        File.Delete("file.parquet");
    }
}
public static void TestHasNext()
{
    const int numRows = 5;
    var schemaColumns = new Column[] { new Column<int>("int32_field") };
    var values = Enumerable.Range(0, numRows).ToArray();

    using var buffer = new ResizableBuffer();

    using (var outStream = new BufferOutputStream(buffer))
    {
        using var writer = new ParquetFileWriter(outStream, schemaColumns);
        using var rowGroupWriter = writer.AppendRowGroup();
        using var colWriter = (ColumnWriter<int>)rowGroupWriter.NextColumn();

        colWriter.WriteBatch(values);
        writer.Close();
    }

    // Read back the columns and make sure they match.
    using var inStream = new BufferReader(buffer);
    using var fileReader = new ParquetFileReader(inStream);
    using var rowGroupReader = fileReader.RowGroup(0);
    using var column = (ColumnReader<int>)rowGroupReader.Column(0);

    var read = new int[1024];
    column.ReadBatch(1024, read, out var numValues);

    Assert.AreEqual(numRows, numValues);
    Assert.AreEqual(values, read.AsSpan(0, numRows).ToArray());
    Assert.IsFalse(column.HasNext);
}
public static void TestReadWriteParquetMultipleTasks()
{
    void WriteFile()
    {
        var schema = new Column[]
        {
            new Column<DateTime>("Col1"),
            new Column<int>("Col2"),
            new Column<float>("Col3")
        };

        const int numRowGroups = 7;
        const int rowsPerRowGroup = 21;
        var data = Enumerable.Range(0, rowsPerRowGroup).ToArray();

        using (var writer = new ParquetFileWriter(Task.CurrentId + ".parquet", schema))
        {
            for (var i = 0; i < numRowGroups; i++)
            {
                using var rowGroup = writer.AppendRowGroup();

                using (var col1 = rowGroup.NextColumn().LogicalWriter<DateTime>())
                {
                    col1.WriteBatch(data.Select(n => new DateTime(2012, 1, 1).AddDays(n)).ToArray());
                }

                using (var col2 = rowGroup.NextColumn().LogicalWriter<int>())
                {
                    col2.WriteBatch(data);
                }

                using (var col3 = rowGroup.NextColumn().LogicalWriter<float>())
                {
                    col3.WriteBatch(data.Select(n => n + 0.1f).ToArray());
                }
            }

            writer.Close();
        }

        File.Delete(Task.CurrentId + ".parquet");
        Console.WriteLine(Task.CurrentId + " completed.");
    }

    const int numThreads = 14;
    const int numRuns = 30000;
    var running = new Task[numRuns];

    ThreadPool.SetMaxThreads(numThreads, numThreads);

    foreach (var i in Enumerable.Range(0, numRuns))
    {
        running[i] = Task.Factory.StartNew(WriteFile, CancellationToken.None);
    }

    Task.WaitAll(running);
}
public static void TestSkip()
{
    const int numRows = 11;
    var schemaColumns = new Column[] { new Column<int>("int32_field") };
    var values = Enumerable.Range(0, numRows).ToArray();

    using var buffer = new ResizableBuffer();

    using (var outStream = new BufferOutputStream(buffer))
    {
        using var writer = new ParquetFileWriter(outStream, schemaColumns);

        using (var rowGroupWriter = writer.AppendRowGroup())
        {
            using var colWriter = (ColumnWriter<int>)rowGroupWriter.NextColumn();
            colWriter.WriteBatch(numRows, values);
        }

        writer.Close();
    }

    using var inStream = new BufferReader(buffer);
    using var fileReader = new ParquetFileReader(inStream);
    using var rowGroupReader = fileReader.RowGroup(0);

    // Read back the column after skipping numToSkip rows and make sure the values are what we expect.
    using (var column = rowGroupReader.Column(0))
    {
        const int numToSkip = 5;

        var skipped = column.Skip(numToSkip);
        Assert.AreEqual(numToSkip, skipped);

        var read = new int[1024];
        ((ColumnReader<int>)column).ReadBatch(1024, read, out var numValues);

        Assert.AreEqual(numRows - numToSkip, numValues);
        Assert.AreEqual(values.AsSpan(numToSkip).ToArray(), read.AsSpan(0, numRows - numToSkip).ToArray());
    }

    // Check that the skipped count is bounded by the total number of rows.
    using (var column = rowGroupReader.Column(0))
    {
        var skipped = column.Skip(1024);

        Assert.AreEqual(numRows, skipped);
        Assert.IsFalse(column.HasNext);
    }
}
public static void TestByteStreamSplitEncoding()
{
    const int numRows = 10230;

    var ids = Enumerable.Range(0, numRows).ToArray();
    var values = ids.Select(i => i / 3.14f).ToArray();

    using var buffer = new ResizableBuffer();

    using (var output = new BufferOutputStream(buffer))
    {
        var columns = new Column[]
        {
            new Column<int>("id"),
            new Column<float>("value")
        };

        using var writerProperties = new WriterPropertiesBuilder()
            .Compression(Compression.Lz4)
            .DisableDictionary("value")
            .Encoding("value", Encoding.ByteStreamSplit)
            .Build();

        using var fileWriter = new ParquetFileWriter(output, columns, writerProperties);
        using var groupWriter = fileWriter.AppendRowGroup();

        using var idWriter = groupWriter.NextColumn().LogicalWriter<int>();
        idWriter.WriteBatch(ids);

        using var valueWriter = groupWriter.NextColumn().LogicalWriter<float>();
        valueWriter.WriteBatch(values);

        fileWriter.Close();
    }

    using var input = new BufferReader(buffer);
    using var fileReader = new ParquetFileReader(input);
    using var groupReader = fileReader.RowGroup(0);

    using var metadataId = groupReader.MetaData.GetColumnChunkMetaData(0);
    using var metadataValue = groupReader.MetaData.GetColumnChunkMetaData(1);

    Assert.AreEqual(new[] { Encoding.PlainDictionary, Encoding.Plain, Encoding.Rle }, metadataId.Encodings);
    Assert.AreEqual(new[] { Encoding.ByteStreamSplit, Encoding.Rle }, metadataValue.Encodings);

    using var idReader = groupReader.Column(0).LogicalReader<int>();
    using var valueReader = groupReader.Column(1).LogicalReader<float>();

    Assert.AreEqual(ids, idReader.ReadAll(numRows));
    Assert.AreEqual(values, valueReader.ReadAll(numRows));
}
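To make the encoding assertion above easier to follow: BYTE_STREAM_SPLIT stores the k-th byte of every value contiguously (one stream per byte position), which often compresses floating-point data better than plain encoding. A standalone illustration of that byte shuffle, not ParquetSharp API:

// Illustrative sketch of the BYTE_STREAM_SPLIT byte shuffle for floats:
// the output is the stream of all 1st bytes, then all 2nd bytes, and so on.
private static byte[] ByteStreamSplit(float[] values)
{
    var raw = new byte[values.Length * sizeof(float)];
    System.Buffer.BlockCopy(values, 0, raw, 0, raw.Length);

    var split = new byte[raw.Length];
    for (var i = 0; i < values.Length; i++)
    {
        for (var b = 0; b < sizeof(float); b++)
        {
            split[b * values.Length + i] = raw[i * sizeof(float) + b];
        }
    }

    return split;
}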
public static void TestDecimalSeries([Values(0, 1)] int warmup)
{
    var timer = Stopwatch.StartNew();
    var rand = new Random(123);

    Console.WriteLine("Generating data...");

    var values = Enumerable.Range(0, 10_000_000).Select(i =>
    {
        var n = rand.Next();
        var sign = rand.NextDouble() < 0.5 ? -1M : +1M;
        return sign * ((decimal)n * n * n) / 1000M;
    }).ToArray();

    Console.WriteLine("Generated {0:N0} rows in {1:N2} sec", values.Length, timer.Elapsed.TotalSeconds);
    Console.WriteLine();

    Console.WriteLine("Saving to Parquet");
    timer.Restart();

    using (var fileWriter = new ParquetFileWriter("decimal_timeseries.parquet", new Column[] { new Column<decimal>("Value", LogicalType.Decimal(precision: 29, scale: 3)) }))
    {
        using (var rowGroupWriter = fileWriter.AppendRowGroup())
        {
            using var valueWriter = rowGroupWriter.NextColumn().LogicalWriter<decimal>();
            valueWriter.WriteBatch(values);
        }

        fileWriter.Close();
    }

    Console.WriteLine("Saved to Parquet ({0:N0} bytes) in {1:N2} sec", new FileInfo("decimal_timeseries.parquet").Length, timer.Elapsed.TotalSeconds);
    Console.WriteLine();

    Console.WriteLine("Saving to Parquet.NET");
    timer.Restart();

    {
        var valueField = new DecimalDataField("Value", precision: 29, scale: 3);
        var schema = new Parquet.Data.Schema(valueField);

        using var stream = File.Create("decimal_timeseries.parquet.net");
        using var parquetWriter = new ParquetWriter(schema, stream);
        using var groupWriter = parquetWriter.CreateRowGroup();

        groupWriter.WriteColumn(new DataColumn(valueField, values));
    }

    Console.WriteLine("Saved to Parquet.NET ({0:N0} bytes) in {1:N2} sec", new FileInfo("decimal_timeseries.parquet.net").Length, timer.Elapsed.TotalSeconds);
}
public FloatTimeSeriesRead()
{
    Console.WriteLine("Writing data...");
    var timer = Stopwatch.StartNew();

    DateTime[] dates;
    int[] objectIds;
    float[][] values;
    (dates, objectIds, values, _numRows) = CreateFloatDataFrame(3600);

    _allDates = dates.SelectMany(d => Enumerable.Repeat(d, objectIds.Length)).ToArray();
    _allDatesAsDateTimeOffsets = dates.SelectMany(d => Enumerable.Repeat(new DateTimeOffset(d, TimeSpan.Zero), objectIds.Length)).ToArray();
    _allObjectIds = dates.SelectMany(d => objectIds).ToArray();
    _allValues = dates.SelectMany((d, i) => values[i]).ToArray();

    using (var fileWriter = new ParquetFileWriter(Filename, CreateFloatColumns(), Compression.Snappy))
    {
        using var rowGroupWriter = fileWriter.AppendRowGroup();

        using (var dateTimeWriter = rowGroupWriter.NextColumn().LogicalWriter<DateTime>())
        {
            for (int i = 0; i != dates.Length; ++i)
            {
                dateTimeWriter.WriteBatch(Enumerable.Repeat(dates[i], objectIds.Length).ToArray());
            }
        }

        using (var objectIdWriter = rowGroupWriter.NextColumn().LogicalWriter<int>())
        {
            for (int i = 0; i != dates.Length; ++i)
            {
                objectIdWriter.WriteBatch(objectIds);
            }
        }

        using (var valueWriter = rowGroupWriter.NextColumn().LogicalWriter<float>())
        {
            for (int i = 0; i != dates.Length; ++i)
            {
                valueWriter.WriteBatch(values[i]);
            }
        }

        fileWriter.Close();
    }

    Console.WriteLine("Wrote {0:N0} rows in {1:N2} sec", _numRows, timer.Elapsed.TotalSeconds);
    Console.WriteLine();
}
public long ParquetSharp()
{
    using (var fileWriter = new ParquetFileWriter("decimal_timeseries.parquet", new Column[] { new Column<decimal>("Value", LogicalType.Decimal(precision: 29, scale: 3)) }))
    {
        using var rowGroupWriter = fileWriter.AppendRowGroup();
        using var valueWriter = rowGroupWriter.NextColumn().LogicalWriter<decimal>();

        valueWriter.WriteBatch(_values);
        fileWriter.Close();
    }

    return new FileInfo("decimal_timeseries.parquet").Length;
}
public static void TestByteBufferOptimisation()
{
    const int numStrings = 100_000;

    var strings = Enumerable.Range(0, numStrings).Select(i => i.ToString()).ToArray();

    var cancel = new CancellationTokenSource();
    var task = Task.Run(() =>
    {
        while (!cancel.IsCancellationRequested)
        {
            GC.Collect();
            GC.WaitForPendingFinalizers();
            Thread.Sleep(1);
        }
    });

    using (var buffer = new ResizableBuffer())
    {
        using (var outStream = new BufferOutputStream(buffer))
        {
            using var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column<string>("Name") });

            using (var groupWriter = fileWriter.AppendRowGroup())
            {
                using var columnWriter = groupWriter.NextColumn().LogicalWriter<string>();

                // String-to-byte-array memory pooling is done by the ByteBuffer class.
                // If something is fishy there (e.g. bad memory ownership wrt the GC),
                // we expect to see consequences here if we write enough strings.
                // It's not bullet proof, but it has found a few issues.
                columnWriter.WriteBatch(strings);
            }

            fileWriter.Close();
        }

        using var inStream = new BufferReader(buffer);
        using var fileReader = new ParquetFileReader(inStream);
        using var groupReader = fileReader.RowGroup(0);
        using var columnReader = groupReader.Column(0).LogicalReader<string>();

        Assert.AreEqual(strings, columnReader.ReadAll(numStrings));
    }

    cancel.Cancel();
    task.Wait();
}
public static void TestRoundTripBuffered(
    // 2^i, 7^j, 11^k are mutually co-prime for i,j,k>0
    [Values(2, 8, 32, 128)] int rowsPerBatch,
    [Values(7, 49, 343, 2401)] int writeBufferLength,
    [Values(11, 121, 1331)] int readBufferLength,
    [Values(true, false)] bool useDictionaryEncoding
)
{
    var expectedColumns = CreateExpectedColumns();
    var schemaColumns = expectedColumns
        .Select(c => new Column(c.Values.GetType().GetElementType() ?? throw new InvalidOperationException(), c.Name, c.LogicalTypeOverride))
        .ToArray();

    using var buffer = new ResizableBuffer();

    // Write our expected columns to the parquet in-memory file.
    using (var outStream = new BufferOutputStream(buffer))
    {
        using var writerProperties = CreateWriterProperties(expectedColumns, useDictionaryEncoding);
        using var fileWriter = new ParquetFileWriter(outStream, schemaColumns, writerProperties);
        using var rowGroupWriter = fileWriter.AppendBufferedRowGroup();

        const int rangeLength = 9;

        for (int r = 0; r < NumRows; r += rangeLength)
        {
            for (var i = 0; i < expectedColumns.Length; i++)
            {
                var column = expectedColumns[i];
                var range = (r, Math.Min(r + rangeLength, NumRows));

                Console.WriteLine("Writing '{0}' (element type: {1}) (range: {2})", column.Name, column.Values.GetType().GetElementType(), range);

                using var columnWriter = rowGroupWriter.Column(i).LogicalWriter(writeBufferLength);
                columnWriter.Apply(new LogicalValueSetter(column.Values, rowsPerBatch, range));
            }
        }

        fileWriter.Close();
    }

    Console.WriteLine();

    // Read back the columns and make sure they match.
    AssertReadRoundtrip(rowsPerBatch, readBufferLength, buffer, expectedColumns);
}
public static void TestBigArrayRoundtrip()
{
    // Create a big array of float arrays. Try to detect buffer-size related issues.
    var m = 8196;
    var ar = new float[m];
    for (var i = 0; i < m; i++)
    {
        ar[i] = i;
    }

    var n = 4;
    var expected = new float[n][];
    for (var i = 0; i < n; i++)
    {
        expected[i] = ar;
    }

    using var buffer = new ResizableBuffer();

    // Write out a single column.
    using (var outStream = new BufferOutputStream(buffer))
    {
        using var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column<float[]>("big_array_field") });

        using (var rowGroupWriter = fileWriter.AppendRowGroup())
        {
            using var colWriter = rowGroupWriter.NextColumn().LogicalWriter<float[]>();
            colWriter.WriteBatch(expected);
        }

        fileWriter.Close();
    }

    // Read it back.
    using var inStream = new BufferReader(buffer);
    using var fileReader = new ParquetFileReader(inStream);
    using var rowGroup = fileReader.RowGroup(0);
    using var columnReader = rowGroup.Column(0).LogicalReader<float[]>();

    var allData = columnReader.ReadAll((int)rowGroup.MetaData.NumRows);
    Assert.AreEqual(expected, allData);
}
private static void TestRoundTripBuffered(ExpectedColumn[] expectedColumns, bool useDictionaryEncoding)
{
    // Same as the default round-trip test, but using buffered row groups.
    var schema = CreateSchema(expectedColumns);
    var writerProperties = CreateWriterProperties(expectedColumns, useDictionaryEncoding);
    var keyValueMetadata = new Dictionary<string, string> { { "case", "Test" }, { "Awesome", "true" } };

    using var buffer = new ResizableBuffer();

    // Write our expected columns to the parquet in-memory file.
    using (var outStream = new BufferOutputStream(buffer))
    {
        using var fileWriter = new ParquetFileWriter(outStream, schema, writerProperties, keyValueMetadata);
        using var rowGroupWriter = fileWriter.AppendBufferedRowGroup();

        const int rangeLength = 9;
        var numRows = expectedColumns.First().Values.Length;

        for (int r = 0; r < numRows; r += rangeLength)
        {
            for (var i = 0; i < expectedColumns.Length; i++)
            {
                var column = expectedColumns[i];
                var range = (r, Math.Min(r + rangeLength, numRows));

                if (range.Item1 == 0 || range.Item2 == numRows)
                {
                    Console.WriteLine("Writing '{0}' (range: {1})", column.Name, range);
                }

                using var columnWriter = rowGroupWriter.Column(i);
                columnWriter.Apply(new ValueSetter(column.Values, range));
            }
        }

        fileWriter.Close();
    }

    // Read back the columns and make sure they match.
    AssertReadRoundtrip(buffer, expectedColumns, useDictionaryEncoding);
}
public static void TestArrayEdgeCasesRoundtrip()
{
    /*
     * [None, [], [1.0, None, 2.0]]
     * []
     * None
     * [[]]
     */
    var expected = new double?[][][]
    {
        new double?[][] { null, new double?[] { }, new double?[] { 1.0, null, 2.0 } },
        new double?[][] { },
        null,
        new double?[][] { new double?[] { } }
    };

    using var buffer = new ResizableBuffer();

    using (var outStream = new BufferOutputStream(buffer))
    {
        using var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column<double?[][]>("a") });

        using (var rowGroupWriter = fileWriter.AppendRowGroup())
        {
            using var colWriter = rowGroupWriter.NextColumn().LogicalWriter<double?[][]>();
            colWriter.WriteBatch(expected);
        }

        fileWriter.Close();
    }

    using var inStream = new BufferReader(buffer);
    using var fileReader = new ParquetFileReader(inStream);
    using var rowGroup = fileReader.RowGroup(0);
    using var columnReader = rowGroup.Column(0).LogicalReader<double?[][]>();

    Assert.AreEqual(4, rowGroup.MetaData.NumRows);

    var allData = columnReader.ReadAll(4);
    Assert.AreEqual(expected, allData);
}
public static unsafe void TestParquetReadFromBuffer()
{
    var expected = Enumerable.Range(0, 100).ToArray();

    // Write out a single column.
    byte[] parquetFileBytes;
    using (var outBuffer = new ResizableBuffer())
    {
        using (var outStream = new BufferOutputStream(outBuffer))
        {
            using var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column<int>("int_field") });

            using (var rowGroupWriter = fileWriter.AppendRowGroup())
            {
                using var colWriter = rowGroupWriter.NextColumn().LogicalWriter<int>();
                colWriter.WriteBatch(expected);
            }

            fileWriter.Close();
        }

        parquetFileBytes = outBuffer.ToArray();
    }

    // Read it back.
    fixed (byte* fixedBytes = parquetFileBytes)
    {
        using var buffer = new IO.Buffer(new IntPtr(fixedBytes), parquetFileBytes.Length);
        using var inStream = new BufferReader(buffer);
        using var fileReader = new ParquetFileReader(inStream);
        using var rowGroup = fileReader.RowGroup(0);
        using var columnReader = rowGroup.Column(0).LogicalReader<int>();

        var allData = columnReader.ReadAll((int)rowGroup.MetaData.NumRows);
        Assert.AreEqual(expected, allData);
    }
}
private static ResizableBuffer WriteTestValues<TValue>(TValue[] written)
{
    var buffer = new ResizableBuffer();

    try
    {
        using var output = new BufferOutputStream(buffer);
        using var fileWriter = new ParquetFileWriter(output, new Column[] { new Column<TValue>("values") });
        using var groupWriter = fileWriter.AppendRowGroup();
        using var columnWriter = groupWriter.NextColumn().LogicalWriter<TValue>();

        columnWriter.WriteBatch(written);
        fileWriter.Close();

        return buffer;
    }
    catch
    {
        buffer.Dispose();
        throw;
    }
}
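WriteTestValues transfers ownership of the returned buffer to the caller, disposing it itself only on failure, so a typical call site looks like this hypothetical usage:

// Hypothetical usage: the caller owns the returned buffer and disposes it
// once reading is complete.
using var buffer = WriteTestValues(new[] { 1.0f, 2.0f, 3.0f });
using var input = new BufferReader(buffer);
using var fileReader = new ParquetFileReader(input);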
public static void TestRoundTrip(
    // 2^i, 7^j, 11^k are mutually co-prime for i,j,k>0
    [Values(2, 8, 32, 128)] int rowsPerBatch,
    [Values(7, 49, 343, 2401)] int writeBufferLength,
    [Values(11, 121, 1331)] int readBufferLength,
    [Values(true, false)] bool useDictionaryEncoding
)
{
    var expectedColumns = CreateExpectedColumns();
    var schemaColumns = expectedColumns
        .Select(c => new Column(c.Values.GetType().GetElementType() ?? throw new InvalidOperationException(), c.Name, c.LogicalTypeOverride))
        .ToArray();

    using var buffer = new ResizableBuffer();

    // Write our expected columns to the parquet in-memory file.
    using (var outStream = new BufferOutputStream(buffer))
    {
        using var writerProperties = CreateWriterProperties(expectedColumns, useDictionaryEncoding);
        using var fileWriter = new ParquetFileWriter(outStream, schemaColumns, writerProperties);
        using var rowGroupWriter = fileWriter.AppendRowGroup();

        foreach (var column in expectedColumns)
        {
            Console.WriteLine("Writing '{0}' ({1})", column.Name, column.Values.GetType().GetElementType());

            using var columnWriter = rowGroupWriter.NextColumn().LogicalWriter(writeBufferLength);
            columnWriter.Apply(new LogicalValueSetter(column.Values, rowsPerBatch));
        }

        fileWriter.Close();
    }

    Console.WriteLine();

    // Read back the columns and make sure they match.
    AssertReadRoundtrip(rowsPerBatch, readBufferLength, buffer, expectedColumns);
}
private static void TestWriteNoWriterOverride<TValue, TCustom>(TValue[] expected, TCustom[] written)
{
    using var buffer = new ResizableBuffer();

    // Write float values using a custom user type:
    // - Provide a type factory such that Column<VolumeInDollars> can be converted to the right schema node.
    // - Provide a converter factory such that VolumeInDollars values can be written as floats.
    // - Do not explicitly override the expected type when accessing the LogicalColumnWriter.
    using (var output = new BufferOutputStream(buffer))
    {
        using var fileWriter = new ParquetFileWriter(output, new Column[] { new Column<TCustom>("values") }, new WriteTypeFactory())
        {
            LogicalWriteConverterFactory = new WriteConverterFactory()
        };
        using var groupWriter = fileWriter.AppendRowGroup();
        using var columnWriter = groupWriter.NextColumn().LogicalWriter<TCustom>();

        columnWriter.WriteBatch(written);
        fileWriter.Close();
    }

    CheckWrittenValues(buffer, expected);
}