Example 1
        private void CreateParquetFile(ResizableBuffer buffer)
        {
            using (var output = new BufferOutputStream(buffer))
            using (var fileWriter = new ParquetFileWriter(output, CreateFloatColumns(), keyValueMetadata: _keyValueProperties))
            {
                using var rowGroupWriter = fileWriter.AppendRowGroup();

                using (var dateTimeWriter = rowGroupWriter.NextColumn().LogicalWriter<DateTime>())
                {
                    for (int i = 0; i != _dates.Length; ++i)
                    {
                        dateTimeWriter.WriteBatch(Enumerable.Repeat(_dates[i], _objectIds.Length).ToArray());
                    }
                }

                using (var objectIdWriter = rowGroupWriter.NextColumn().LogicalWriter<int>())
                {
                    for (int i = 0; i != _dates.Length; ++i)
                    {
                        objectIdWriter.WriteBatch(_objectIds);
                    }
                }

                using (var valueWriter = rowGroupWriter.NextColumn().LogicalWriter<float>())
                {
                    for (int i = 0; i != _dates.Length; ++i)
                    {
                        valueWriter.WriteBatch(_values[i]);
                    }
                }

                fileWriter.Close();
            }
        }
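
The fields and the CreateFloatColumns() helper used above are not part of this listing. A minimal sketch of what they might look like, inferred from the three writers (DateTime, int, float); every name below is an assumption:

        // Hypothetical declarations assumed by CreateParquetFile and ParquetImpl (not in the original listing).
        private static Column[] CreateFloatColumns()
        {
            return new Column[]
            {
                new Column<DateTime>("DateTime"),
                new Column<int>("ObjectId"),
                new Column<float>("Value")
            };
        }

        private readonly DateTime[] _dates;      // one date per block of rows
        private readonly int[] _objectIds;       // written once per date
        private readonly float[][] _values;      // _values[i].Length == _objectIds.Length
        private readonly IReadOnlyDictionary<string, string> _keyValueProperties;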
Example 2
        public static void TestBufferOutputStreamFinish()
        {
            var expected = Enumerable.Range(0, 100).ToArray();

            using var outStream = new BufferOutputStream();

            // Write out a single column
            using (var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column<int>("int_field") }))
            {
                using (var rowGroupWriter = fileWriter.AppendRowGroup())
                {
                    using var colWriter = rowGroupWriter.NextColumn().LogicalWriter<int>();
                    colWriter.WriteBatch(expected);
                }

                fileWriter.Close();
            }

            // Read it back
            using var buffer       = outStream.Finish();
            using var inStream     = new BufferReader(buffer);
            using var fileReader   = new ParquetFileReader(inStream);
            using var rowGroup     = fileReader.RowGroup(0);
            using var columnReader = rowGroup.Column(0).LogicalReader<int>();

            var allData = columnReader.ReadAll((int)rowGroup.MetaData.NumRows);

            Assert.AreEqual(expected, allData);
        }
        public static void TestFileHandleHasBeenReleased()
        {
            var exception = Assert.Throws<InvalidCastException>(() =>
            {
                try
                {
                    using (var writer = new ParquetFileWriter("file.parquet", new Column[] { new Column<int>("ids") }))
                    {
                        using var groupWriter  = writer.AppendRowGroup();
                        using var columnWriter = groupWriter.NextColumn().LogicalWriter<int>();

                        columnWriter.WriteBatch(new[] { 1, 2, 3 });

                        writer.Close();
                    }

                    // Open with the wrong logical reader type on purpose.
                    using var reader       = new ParquetFileReader("file.parquet");
                    using var groupReader  = reader.RowGroup(0);
                    using var columnReader = groupReader.Column(0).LogicalReader<float>();

                    Assert.AreEqual(new[] { 1, 2, 3 }, columnReader.ReadAll(3));
                }
                finally
                {
                    // This will throw on Windows if the file handle has not been released.
                    File.Delete("file.parquet");
                }
            });

            StringAssert.StartsWith("Unable to cast object of type", exception?.Message);
        }
        public void DecryptParquetFileCorrectly()
        {
            using Stream inputFile         = File.OpenRead("ResourceFiles\\ciphertext.parquet");
            using Stream outputFile        = File.OpenWrite($"ResourceFiles\\{nameof(DecryptParquetFileCorrectly)}_out.parquet");
            using ParquetFileReader reader = new ParquetFileReader(inputFile);

            reader.RegisterKeyStoreProviders(
                new Dictionary<string, EncryptionKeyStoreProvider>
                {
                    [azureKeyProvider.ProviderName] = azureKeyProvider
                });

            var writerSettings = reader.FileEncryptionSettings
                                 .Select(s => (FileEncryptionSettings)s.Clone())
                                 .ToList();

            var targetColumnTypes = reader.FileEncryptionSettings
                                    .Select(s => s.GetSerializer().GetGenericType())
                                    .ToList();

            writerSettings[0]  = Create(targetColumnTypes[0], dataEncryptionKey, EncryptionType.Plaintext, GetSerializer(targetColumnTypes[0]));
            writerSettings[3]  = Create(targetColumnTypes[3], dataEncryptionKey, EncryptionType.Plaintext, GetSerializer(targetColumnTypes[3]));
            writerSettings[10] = Create(targetColumnTypes[10], dataEncryptionKey, EncryptionType.Plaintext, GetSerializer(targetColumnTypes[10]));

            using ParquetFileWriter writer = new ParquetFileWriter(outputFile, writerSettings);

            ColumnarCryptographer cryptographer = new ColumnarCryptographer(reader, writer);

            cryptographer.Transform();
        }
        public static void TestHasNext()
        {
            const int numRows       = 5;
            var       schemaColumns = new Column[] { new Column<int>("int32_field") };
            var       values        = Enumerable.Range(0, numRows).ToArray();

            using var buffer = new ResizableBuffer();

            using (var outStream = new BufferOutputStream(buffer))
            {
                using var writer         = new ParquetFileWriter(outStream, schemaColumns);
                using var rowGroupWriter = writer.AppendRowGroup();
                using var colWriter      = (ColumnWriter<int>)rowGroupWriter.NextColumn();

                colWriter.WriteBatch(values);

                writer.Close();
            }

            // Read back the columns and make sure they match.
            using var inStream       = new BufferReader(buffer);
            using var fileReader     = new ParquetFileReader(inStream);
            using var rowGroupReader = fileReader.RowGroup(0);
            using var column         = (ColumnReader<int>)rowGroupReader.Column(0);

            var read = new int[1024];

            column.ReadBatch(1024, read, out var numValues);

            Assert.AreEqual(numRows, numValues);
            Assert.AreEqual(values, read.AsSpan(0, numRows).ToArray());
            Assert.IsFalse(column.HasNext);
        }
Example 6
        public DecimalRead()
        {
            Console.WriteLine("Writing data...");

            var timer = Stopwatch.StartNew();
            var rand  = new Random(123);

            _values = Enumerable.Range(0, 1_000_000).Select(i =>
            {
                var n    = rand.Next();
                var sign = rand.NextDouble() < 0.5 ? -1M : +1M;
                return sign * ((decimal)n * n * n) / 1000M;
            }).ToArray();

            using (var fileWriter = new ParquetFileWriter(Filename, new Column[] { new Column<decimal>("Value", LogicalType.Decimal(precision: 29, scale: 3)) }))
            {
                using var rowGroupWriter = fileWriter.AppendRowGroup();
                using var valueWriter    = rowGroupWriter.NextColumn().LogicalWriter<decimal>();
                valueWriter.WriteBatch(_values);
                fileWriter.Close();
            }

            Console.WriteLine("Wrote {0:N0} rows in {1:N2} sec", _values.Length, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
        }
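
Filename and _values are fields defined outside this constructor. A plausible sketch (the file name mirrors the ParquetSharp() benchmark further down and is an assumption):

        private const string Filename = "decimal_timeseries.parquet";
        private readonly decimal[] _values;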
Example 7
        private ParquetRowWriter(ParquetFileWriter parquetFileWriter, WriteAction writeAction)
        {
            _parquetFileWriter = parquetFileWriter;
            _rowGroupWriter    = _parquetFileWriter.AppendRowGroup();
            _writeAction       = writeAction;
            _rows = new TTuple[1024];
        }
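
Only the constructor is shown. A hypothetical companion method illustrating how the _rows buffer and write action could interact; the WriteAction signature and the _count field are assumptions, not part of the listing:

        // Hypothetical sketch: buffer incoming rows and flush them through the row group writer when full.
        public void WriteRow(TTuple row)
        {
            if (_count == _rows.Length)
            {
                _writeAction(_rowGroupWriter, _rows, _count);
                _count = 0;
            }

            _rows[_count++] = row;
        }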
Example 8
        public static void TestWriteLongString()
        {
            const int numStrings = 100;

            // Generate lots of digits of 0.123456789101112131415...
            var strings = Enumerable.Range(0, numStrings).Select(i => "0." + string.Join("", Enumerable.Range(1, 3500).Select(j => j.ToString())) + "...").ToArray();

            using var buffer = new ResizableBuffer();

            using (var outStream = new BufferOutputStream(buffer))
            {
                using var fileWriter   = new ParquetFileWriter(outStream, new Column[] { new Column<string>("Name") });
                using var groupWriter  = fileWriter.AppendRowGroup();
                using var columnWriter = groupWriter.NextColumn().LogicalWriter<string>();

                // String-to-byte-array memory pooling is done by the ByteBuffer class.
                // If something is fishy there (e.g. bad memory ownership w.r.t. the GC),
                // we expect to see the consequences here if we write enough strings.
                // It's not bulletproof, but it has found a few issues.
                columnWriter.WriteBatch(strings);

                fileWriter.Close();
            }

            using var inStream     = new BufferReader(buffer);
            using var fileReader   = new ParquetFileReader(inStream);
            using var groupReader  = fileReader.RowGroup(0);
            using var columnReader = groupReader.Column(0).LogicalReader<string>();

            Assert.AreEqual(strings, columnReader.ReadAll(numStrings));
        }
        public static void TestReadException()
        {
            var expected = Enumerable.Range(0, 1024 * 1024).ToArray();

            var exception = Assert.Throws<ParquetException>(() =>
            {
                using var buffer = new ErroneousReaderStream();

                using (var output = new ManagedOutputStream(buffer, leaveOpen: true))
                {
                    using var writer       = new ParquetFileWriter(output, new Column[] { new Column<int>("ids") });
                    using var groupWriter  = writer.AppendRowGroup();
                    using var columnWriter = groupWriter.NextColumn().LogicalWriter<int>();

                    columnWriter.WriteBatch(expected);

                    writer.Close();
                }

                buffer.Seek(0, SeekOrigin.Begin);

                using var input = new ManagedRandomAccessFile(buffer);
                using (new ParquetFileReader(input))
                {
                }
            });

            Assert.That(
                exception.Message,
                Contains.Substring("this is an erroneous reader"));
        }
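
ErroneousReaderStream is not shown. A minimal sketch that would satisfy this test: writes succeed, but any read throws with the message the assertion looks for (the class body is an assumption):

        // Hypothetical sketch: a stream that always fails on read.
        private sealed class ErroneousReaderStream : MemoryStream
        {
            public override int Read(byte[] buffer, int offset, int count)
            {
                throw new IOException("this is an erroneous reader");
            }
        }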
        private static void TestWriteNoColumnNorWriterOverride<TValue, TCustom>(TValue[] expected, TCustom[] written)
        {
            using var buffer = new ResizableBuffer();

            // Write values using a custom user type:
            // - Provide an explicit schema definition that knows nothing about VolumeInDollars and states that it's a float column.
            // - Provide a type factory such that Column("values") is known to be of type VolumeInDollars,
            //   as we do not explicitly state the expected type when accessing the LogicalColumnWriter.
            // - Provide a converter factory such that VolumeInDollars values can be written as floats.
            // - Do not explicitly override the expected type when accessing the LogicalColumnWriter.

            using (var output = new BufferOutputStream(buffer))
            {
                using var schema           = Column.CreateSchemaNode(new Column[] { new Column<TValue>("values") });
                using var writerProperties = CreateWriterProperties();
                using var fileWriter       = new ParquetFileWriter(output, schema, writerProperties)
                {
                    LogicalTypeFactory           = new WriteTypeFactoryNoOverride(),
                    LogicalWriteConverterFactory = new WriteConverterFactory()
                };
                using var groupWriter  = fileWriter.AppendRowGroup();
                using var columnWriter = groupWriter.NextColumn().LogicalWriter<TCustom>();

                columnWriter.WriteBatch(written);
                fileWriter.Close();
            }

            CheckWrittenValues(buffer, expected);
        }
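
The custom VolumeInDollars type used by the factory tests is not part of the listing. A minimal sketch, assuming it is a plain wrapper over the underlying float:

        // Hypothetical sketch of the custom user type that maps onto a float column.
        internal readonly struct VolumeInDollars
        {
            public VolumeInDollars(float value) { Value = value; }

            public readonly float Value;
        }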
Example 11
        public static void TestWriteBatchWithNullOptionalField()
        {
            using var buffer = new ResizableBuffer();

            using (var outStream = new BufferOutputStream(buffer))
            using (var writer = new ParquetFileWriter(outStream, new Column[] { new Column<int?>("int32?") }))
            using (var rowGroupWriter = writer.AppendRowGroup())
            using (var colWriter = (ColumnWriter<int>)rowGroupWriter.NextColumn())
            {
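                // For an optional column the maximum definition level is 1:
                // a definition level of 1 marks a value as present, 0 marks it as null.
                // The three rows {1, null, 2} are therefore described by defLevels {1, 0, 1},
                // and only the two non-null values are passed in.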
                var defLevels = new short[] { 1, 0, 1 };
                var values    = new[] { 1, 2 };

                colWriter.WriteBatch(defLevels.Length, defLevels, null, values);
            }

            using (var inStream = new BufferReader(buffer))
            using (var reader = new ParquetFileReader(inStream))
            using (var rowGroupReader = reader.RowGroup(0))
            using (var colReader = rowGroupReader.Column(0).LogicalReader<int?>())
            {
                var results = new int?[3];
                colReader.ReadBatch(results, 0, 3);

                Assert.AreEqual(new int?[] { 1, null, 2 }, results);
            }
        }
        public static void TestFileHandleHasBeenReleased()
        {
            var exception = Assert.Throws<InvalidCastException>(() =>
            {
                try
                {
                    using (var writer = new ParquetFileWriter("file.parquet", new Column[] { new Column<int>("ids") }))
                    using (var group = writer.AppendRowGroup())
                    using (var column = group.NextColumn().LogicalWriter<int>())
                    {
                        column.WriteBatch(new[] { 1, 2, 3 });
                    }

                    // Open with the wrong logical reader type on purpose.
                    using (var reader = new ParquetFileReader("file.parquet"))
                    using (var group = reader.RowGroup(0))
                    using (var column = group.Column(0).LogicalReader<float>())
                    {
                        Assert.AreEqual(new[] { 1, 2, 3 }, column.ReadAll(3));
                    }
                }
                finally
                {
                    // This will throw on Windows if the file handle has not been released.
                    File.Delete("file.parquet");
                }
            });

            Assert.AreEqual(
                "Unable to cast object of type " +
                "'ParquetSharp.LogicalColumnReader`3[System.Int32,System.Int32,System.Int32]'" +
                " to type 'ParquetSharp.LogicalColumnReader`1[System.Single]'.",
                exception.Message);
        }
        private static void TestRoundTrip(ExpectedColumn[] expectedColumns, bool useDictionaryEncoding)
        {
            var schema           = CreateSchema(expectedColumns);
            var writerProperties = CreateWriterProperties(expectedColumns, useDictionaryEncoding);
            var keyValueMetadata = new Dictionary<string, string>
            {
                { "case", "Test" }, { "Awesome", "true" }
            };

            using var buffer = new ResizableBuffer();

            // Write our expected columns to the parquet in-memory file.
            using (var outStream = new BufferOutputStream(buffer))
            {
                using var fileWriter     = new ParquetFileWriter(outStream, schema, writerProperties, keyValueMetadata);
                using var rowGroupWriter = fileWriter.AppendRowGroup();

                foreach (var column in expectedColumns)
                {
                    Console.WriteLine("Writing '{0}'", column.Name);

                    using var columnWriter = rowGroupWriter.NextColumn();
                    columnWriter.Apply(new ValueSetter(column.Values));
                }

                fileWriter.Close();
            }

            // Read back the columns and make sure they match.
            AssertReadRoundtrip(buffer, expectedColumns, useDictionaryEncoding);
        }
Example 14
        public static void TestArrayOfEmptyStringArraysRoundtrip()
        {
            var expected = new[]
            {
                new string[] { },
                new string[] { },
                new string[] { },
                new string[] { }
            };

            using var buffer = new ResizableBuffer();

            using (var outStream = new BufferOutputStream(buffer))
            {
                using var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column<string[]>("a") });

                using (var rowGroupWriter = fileWriter.AppendRowGroup())
                {
                    using var colWriter = rowGroupWriter.NextColumn().LogicalWriter<string[]>();
                    colWriter.WriteBatch(expected);
                }

                fileWriter.Close();
            }

            using var inStream     = new BufferReader(buffer);
            using var fileReader   = new ParquetFileReader(inStream);
            using var rowGroup     = fileReader.RowGroup(0);
            using var columnReader = rowGroup.Column(0).LogicalReader<string[]>();

            Assert.AreEqual(4, rowGroup.MetaData.NumRows);
            var allData = columnReader.ReadAll(4);

            Assert.AreEqual(expected, allData);
        }
Example 15
        public static void Demo()
        {
            // Open input and output file streams
            using Stream inputFile  = File.OpenRead(".\\ResourceFiles\\userdata1.parquet");
            using Stream outputFile = File.OpenWrite(".\\ResourceFiles\\out1.parquet");

            // Create reader
            using ParquetFileReader reader = new ParquetFileReader(inputFile);

            // Copy source settings as target settings
            List<FileEncryptionSettings> writerSettings = reader.FileEncryptionSettings
                                                          .Select(s => Copy(s))
                                                          .ToList();

            // Modify a few column settings
            writerSettings[0]  = new FileEncryptionSettings<DateTimeOffset?>(encryptionKey, SqlSerializerFactory.Default.GetDefaultSerializer<DateTimeOffset?>());
            writerSettings[3]  = new FileEncryptionSettings<string>(encryptionKey, EncryptionType.Deterministic, new SqlVarcharSerializer(size: 255));
            writerSettings[10] = new FileEncryptionSettings<double?>(encryptionKey, StandardSerializerFactory.Default.GetDefaultSerializer<double?>());

            // Create and pass the target settings to the writer
            using ParquetFileWriter writer = new ParquetFileWriter(outputFile, writerSettings);

            // Process the file
            ColumnarCryptographer cryptographer = new ColumnarCryptographer(reader, writer);

            cryptographer.Transform();

            Console.Clear();
        }
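
The Copy helper is not shown. DecryptParquetFileCorrectly above does the same job with Clone(), so a one-line sketch could be (assumed, not from the listing):

        private static FileEncryptionSettings Copy(FileEncryptionSettings settings) => (FileEncryptionSettings)settings.Clone();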
        public static void TestInMemoryRoundTrip()
        {
            var expected = Enumerable.Range(0, 1024 * 1024).ToArray();

            using var buffer = new MemoryStream();

            // Write test data.
            using (var output = new ManagedOutputStream(buffer, leaveOpen: true))
            {
                using var writer       = new ParquetFileWriter(output, new Column[] { new Column<int>("ids") });
                using var groupWriter  = writer.AppendRowGroup();
                using var columnWriter = groupWriter.NextColumn().LogicalWriter<int>();

                columnWriter.WriteBatch(expected);

                writer.Close();
            }

            // Seek back to start.
            buffer.Seek(0, SeekOrigin.Begin);

            // Read test data.
            using var input        = new ManagedRandomAccessFile(buffer, leaveOpen: true);
            using var reader       = new ParquetFileReader(input);
            using var groupReader  = reader.RowGroup(0);
            using var columnReader = groupReader.Column(0).LogicalReader<int>();

            Assert.AreEqual(expected, columnReader.ReadAll(expected.Length));
        }
        public static void TestFileStreamRoundTrip()
        {
            try
            {
                using (var output = new ManagedOutputStream(File.OpenWrite("file.parquet")))
                {
                    using var writer       = new ParquetFileWriter(output, new Column[] { new Column<int>("ids") });
                    using var groupWriter  = writer.AppendRowGroup();
                    using var columnWriter = groupWriter.NextColumn().LogicalWriter<int>();

                    columnWriter.WriteBatch(new[] { 1, 2, 3 });

                    writer.Close();
                }

                using var input        = new ManagedRandomAccessFile(File.OpenRead("file.parquet"));
                using var reader       = new ParquetFileReader(input);
                using var groupReader  = reader.RowGroup(0);
                using var columnReader = groupReader.Column(0).LogicalReader<int>();

                Assert.AreEqual(new[] { 1, 2, 3 }, columnReader.ReadAll(3));
            }
            finally
            {
                File.Delete("file.parquet");
            }
        }
Example 18
        public static unsafe void TestParquetReadFromBuffer()
        {
            var expected = Enumerable.Range(0, 100).ToArray();

            // Write out a single column
            byte[] parquetFileBytes;
            using (var outBuffer = new ResizableBuffer())
            {
                using (var outStream = new BufferOutputStream(outBuffer))
                using (var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column<int>("int_field") }))
                using (var rowGroupWriter = fileWriter.AppendRowGroup())
                using (var colWriter = rowGroupWriter.NextColumn().LogicalWriter<int>())
                {
                    colWriter.WriteBatch(expected);
                }

                parquetFileBytes = outBuffer.ToArray();
            }

            // Read it back
            fixed (byte* fixedBytes = parquetFileBytes)
            using (var buffer = new IO.Buffer(new IntPtr(fixedBytes), parquetFileBytes.Length))
            using (var inStream = new BufferReader(buffer))
            using (var fileReader = new ParquetFileReader(inStream))
            using (var rowGroup = fileReader.RowGroup(0))
            using (var columnReader = rowGroup.Column(0).LogicalReader<int>())
            {
                var allData = columnReader.ReadAll((int)rowGroup.MetaData.NumRows);
                Assert.AreEqual(expected, allData);
            }
        }
Example 19
        public static void TestAgainstThirdParty()
        {
            var columns = new Column[] { new Column<decimal>("Decimal", LogicalType.Decimal(precision: 29, scale: 3)) };
            var values  = Enumerable.Range(0, 10_000)
                          .Select(i => ((decimal)i * i * i) / 1000 - 10)
                          .Concat(new[] { decimal.MinValue / 1000, decimal.MaxValue / 1000 })
                          .ToArray();

            using var buffer = new ResizableBuffer();

            // Write using ParquetSharp
            using (var outStream = new BufferOutputStream(buffer))
            {
                using var fileWriter     = new ParquetFileWriter(outStream, columns, Compression.Snappy);
                using var rowGroupWriter = fileWriter.AppendRowGroup();
                using var columnWriter   = rowGroupWriter.NextColumn().LogicalWriter<decimal>();

                columnWriter.WriteBatch(values);

                fileWriter.Close();
            }

            // Read using Parquet.NET
            using var memoryStream   = new MemoryStream(buffer.ToArray());
            using var fileReader     = new ParquetReader(memoryStream);
            using var rowGroupReader = fileReader.OpenRowGroupReader(0);

            var read = (decimal[])rowGroupReader.ReadColumn(fileReader.Schema.GetDataFields()[0]).Data;

            Assert.AreEqual(values, read);
        }
Example 20
        private void ParquetImpl(ParquetFileWriter fileWriter)
        {
            using var rowGroupWriter = fileWriter.AppendRowGroup();

            using (var dateTimeWriter = rowGroupWriter.NextColumn().LogicalWriter<DateTime>())
            {
                for (int i = 0; i != _dates.Length; ++i)
                {
                    dateTimeWriter.WriteBatch(Enumerable.Repeat(_dates[i], _objectIds.Length).ToArray());
                }
            }

            using (var objectIdWriter = rowGroupWriter.NextColumn().LogicalWriter<int>())
            {
                for (int i = 0; i != _dates.Length; ++i)
                {
                    objectIdWriter.WriteBatch(_objectIds);
                }
            }

            using (var valueWriter = rowGroupWriter.NextColumn().LogicalWriter<float>())
            {
                for (int i = 0; i != _dates.Length; ++i)
                {
                    valueWriter.WriteBatch(_values[i]);
                }
            }

            fileWriter.Close();
        }
Example 21
        public static void TestReadWriteParquetMultipleTasks()
        {
            void WriteFile()
            {
                var schema = new Column[]
                {
                    new Column<DateTime>("Col1"),
                    new Column<int>("Col2"),
                    new Column<float>("Col3")
                };

                const int numRowGroups    = 7;
                const int rowsPerRowGroup = 21;
                var       data            = Enumerable.Range(0, rowsPerRowGroup).ToArray();

                using (var writer1 = new ParquetFileWriter(Task.CurrentId + ".parquet", schema))
                {
                    for (var i = 0; i < numRowGroups; i++)
                    {
                        using var rg1 = writer1.AppendRowGroup();

                        using (var col1Rg1 = rg1.NextColumn().LogicalWriter<DateTime>())
                        {
                            col1Rg1.WriteBatch(data.Select(n => new DateTime(2012, 1, 1).AddDays(n)).ToArray());
                        }

                        using (var col2Rg1 = rg1.NextColumn().LogicalWriter<int>())
                        {
                            col2Rg1.WriteBatch(data);
                        }

                        using (var col3Rg1 = rg1.NextColumn().LogicalWriter<float>())
                        {
                            col3Rg1.WriteBatch(data.Select(n => n + 0.1f).ToArray());
                        }
                    }

                    writer1.Close();
                }

                File.Delete(Task.CurrentId + ".parquet");

                Console.WriteLine(Task.CurrentId + " completed.");
            }

            const int numThreads = 14;
            const int numRuns    = 30000;
            var       running    = new Task[numRuns];

            ThreadPool.SetMaxThreads(numThreads, numThreads);

            foreach (var i in Enumerable.Range(0, numRuns))
            {
                running[i] = Task.Factory.StartNew(WriteFile, CancellationToken.None);
            }

            Task.WaitAll(running);
        }
Example 22
        public long Parquet()
        {
            using (var fileWriter = new ParquetFileWriter("float_timeseries.parquet", CreateFloatColumns()))
            {
                ParquetImpl(fileWriter);
            }

            return new FileInfo("float_timeseries.parquet").Length;
        }
Example 23
        public long ParquetStream()
        {
            using (var stream = new FileStream("float_timeseries.parquet.stream", FileMode.Create))
            {
                using var writer     = new IO.ManagedOutputStream(stream);
                using var fileWriter = new ParquetFileWriter(writer, CreateFloatColumns());
                ParquetImpl(fileWriter);
            }

            return new FileInfo("float_timeseries.parquet.stream").Length;
        }
        public static void TestWriteNoTypeFactory()
        {
            // Test that we cannot create a writer using a custom type without providing a factory.
            using var buffer = new ResizableBuffer();
            using var output = new BufferOutputStream(buffer);

            var exception = Assert.Throws<ArgumentException>(() =>
            {
                using var fileWriter = new ParquetFileWriter(output, new Column[] { new Column<VolumeInDollars>("value") });
            });

            StringAssert.StartsWith("unsupported logical type", exception?.Message);
        }
Example 25
        public static void TestDisposeExceptionSafety_ParquetFileWriter()
        {
            var exception = Assert.Throws<Exception>(() =>
            {
                using var buffer     = new ResizableBuffer();
                using var outStream  = new BufferOutputStream(buffer);
                using var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column<int>("Index"), new Column<float>("Value") });

                throw new Exception("this is the expected message");
            });

            Assert.That(exception.Message, Contains.Substring("this is the expected message"));
        }
Example 26
        public static void TestSkip()
        {
            const int numRows = 11;

            var schemaColumns = new Column[] { new Column<int>("int32_field") };
            var values        = Enumerable.Range(0, numRows).ToArray();

            using var buffer = new ResizableBuffer();

            using (var outStream = new BufferOutputStream(buffer))
            {
                using var writer = new ParquetFileWriter(outStream, schemaColumns);

                using (var rowGroupWriter = writer.AppendRowGroup())
                {
                    var colWriter = (ColumnWriter<int>)rowGroupWriter.NextColumn();
                    colWriter.WriteBatch(numRows, values);
                }

                writer.Close();
            }

            using var inStream       = new BufferReader(buffer);
            using var fileReader     = new ParquetFileReader(inStream);
            using var rowGroupReader = fileReader.RowGroup(0);

            // Read back the columns after skipping numRows and make sure the values are what we expect.
            using (var column = rowGroupReader.Column(0))
            {
                const int numToSkip = 5;

                var skipped = column.Skip(numToSkip);

                Assert.AreEqual(numToSkip, skipped);

                var read = new int[1024];
                ((ColumnReader<int>)column).ReadBatch(1024, read, out var numValues);

                Assert.AreEqual(numRows - numToSkip, numValues);
                Assert.AreEqual(values.AsSpan(numToSkip).ToArray(), read.AsSpan(0, numRows - numToSkip).ToArray());
            }

            // Check skipped is bound to the maximum number of rows.
            using (var column = rowGroupReader.Column(0))
            {
                var skipped = column.Skip(1024);

                Assert.AreEqual(numRows, skipped);
                Assert.IsFalse(column.HasNext);
            }
        }
        public static void TestByteStreamSplitEncoding()
        {
            const int numRows = 10230;

            var ids    = Enumerable.Range(0, numRows).ToArray();
            var values = ids.Select(i => i / 3.14f).ToArray();

            using var buffer = new ResizableBuffer();

            using (var output = new BufferOutputStream(buffer))
            {
                var columns = new Column[]
                {
                    new Column<int>("id"),
                    new Column<float>("value")
                };

                var p = new WriterPropertiesBuilder()
                        .Compression(Compression.Lz4)
                        .DisableDictionary("value")
                        .Encoding("value", Encoding.ByteStreamSplit)
                        .Build();

                using var fileWriter  = new ParquetFileWriter(output, columns, p);
                using var groupWriter = fileWriter.AppendRowGroup();

                using var idWriter = groupWriter.NextColumn().LogicalWriter<int>();
                idWriter.WriteBatch(ids);

                using var valueWriter = groupWriter.NextColumn().LogicalWriter<float>();
                valueWriter.WriteBatch(values);

                fileWriter.Close();
            }

            using var input       = new BufferReader(buffer);
            using var fileReader  = new ParquetFileReader(input);
            using var groupReader = fileReader.RowGroup(0);

            using var metadataId    = groupReader.MetaData.GetColumnChunkMetaData(0);
            using var metadataValue = groupReader.MetaData.GetColumnChunkMetaData(1);

            Assert.AreEqual(new[] { Encoding.PlainDictionary, Encoding.Plain, Encoding.Rle }, metadataId.Encodings);
            Assert.AreEqual(new[] { Encoding.ByteStreamSplit, Encoding.Rle }, metadataValue.Encodings);

            using var idReader    = groupReader.Column(0).LogicalReader<int>();
            using var valueReader = groupReader.Column(1).LogicalReader<float>();

            Assert.AreEqual(ids, idReader.ReadAll(numRows));
            Assert.AreEqual(values, valueReader.ReadAll(numRows));
        }
Example 28
        public static void TestDisposedAccess()
        {
            using var buffer = new ResizableBuffer();

            // Write our expected columns to the parquet in-memory file.
            using var outStream  = new BufferOutputStream(buffer);
            using var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column<int>("Index") });

            fileWriter.Dispose();

            var exception = Assert.Throws<NullReferenceException>(() => fileWriter.AppendRowGroup());

            Assert.AreEqual("null native handle", exception.Message);
        }
Example 29
        public long ParquetSharp()
        {
            using (var fileWriter = new ParquetFileWriter("decimal_timeseries.parquet", new Column[] { new Column<decimal>("Value", LogicalType.Decimal(precision: 29, scale: 3)) }))
            {
                using var rowGroupWriter = fileWriter.AppendRowGroup();
                using var valueWriter    = rowGroupWriter.NextColumn().LogicalWriter<decimal>();

                valueWriter.WriteBatch(_values);

                fileWriter.Close();
            }

            return new FileInfo("decimal_timeseries.parquet").Length;
        }
        public static void TestWriteNoConverterFactory()
        {
            // Test that we cannot write values using a custom type without providing a factory.
            using var buffer           = new ResizableBuffer();
            using var output           = new BufferOutputStream(buffer);
            using var schema           = Column.CreateSchemaNode(new Column[] { new Column<float>("values") });
            using var writerProperties = CreateWriterProperties();
            using var fileWriter       = new ParquetFileWriter(output, schema, writerProperties);
            using var groupWriter      = fileWriter.AppendRowGroup();

            var exception = Assert.Throws<NotSupportedException>(() => groupWriter.NextColumn().LogicalWriterOverride<VolumeInDollars>());

            StringAssert.StartsWith("unsupported logical system type", exception?.Message);
        }