Example #1
        private void CreateParquetFile(ResizableBuffer buffer)
        {
            using (var output = new BufferOutputStream(buffer))
                using (var fileWriter = new ParquetFileWriter(output, CreateFloatColumns(), keyValueMetadata: _keyValueProperties))
                {
                    using var rowGroupWriter = fileWriter.AppendRowGroup();

                    using (var dateTimeWriter = rowGroupWriter.NextColumn().LogicalWriter<DateTime>())
                    {
                        for (int i = 0; i != _dates.Length; ++i)
                        {
                            dateTimeWriter.WriteBatch(Enumerable.Repeat(_dates[i], _objectIds.Length).ToArray());
                        }
                    }

                    using (var objectIdWriter = rowGroupWriter.NextColumn().LogicalWriter<int>())
                    {
                        for (int i = 0; i != _dates.Length; ++i)
                        {
                            objectIdWriter.WriteBatch(_objectIds);
                        }
                    }

                    using (var valueWriter = rowGroupWriter.NextColumn().LogicalWriter<float>())
                    {
                        for (int i = 0; i != _dates.Length; ++i)
                        {
                            valueWriter.WriteBatch(_values[i]);
                        }
                    }

                    fileWriter.Close();
                }
        }
        private static void TestWriteNoColumnNorWriterOverride<TValue, TCustom>(TValue[] expected, TCustom[] written)
        {
            using var buffer = new ResizableBuffer();

            // Write float values using a custom user-type:
            // - Provide explicit schema definition that knows nothing about VolumeInDollars, and states that it's a float column.
            // - Provide a type factory such that Column("values") is known to be of VolumeInDollars,
            //   as we do not explicitly state the expected type when accessing the LogicalColumnWriter.
            // - Provide a converter factory such that VolumeInDollars values can be written as floats.
            // - Do not explicitly override the expected type when accessing the LogicalColumnWriter.

            using (var output = new BufferOutputStream(buffer))
            {
                using var schema           = Column.CreateSchemaNode(new Column[] { new Column<TValue>("values") });
                using var writerProperties = CreateWriterProperties();
                using var fileWriter       = new ParquetFileWriter(output, schema, writerProperties)
                      {
                          LogicalTypeFactory           = new WriteTypeFactoryNoOverride(),
                          LogicalWriteConverterFactory = new WriteConverterFactory()
                      };
                using var groupWriter  = fileWriter.AppendRowGroup();
                using var columnWriter = groupWriter.NextColumn().LogicalWriter<TCustom>();

                columnWriter.WriteBatch(written);
                fileWriter.Close();
            }

            CheckWrittenValues(buffer, expected);
        }
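        // For context, a minimal sketch of the custom user type targeted by the
        // factories above (an assumption; the real VolumeInDollars and factory
        // helpers are defined elsewhere in the test suite): a thin wrapper over float.
        internal readonly struct VolumeInDollars
        {
            public VolumeInDollars(float value) { Value = value; }

            public readonly float Value;
        }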
Example #3
        public static void TestWriteBatchWithNullOptionalField()
        {
            using var buffer = new ResizableBuffer();

            using (var outStream = new BufferOutputStream(buffer))
            {
                using var writer         = new ParquetFileWriter(outStream, new Column[] { new Column<int?>("int32?") });
                using var rowGroupWriter = writer.AppendRowGroup();
                using var colWriter      = (ColumnWriter<int>)rowGroupWriter.NextColumn();

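                // Definition levels encode nullability for the optional column:
                // 1 = value present, 0 = null, so values are supplied only for the
                // non-null entries.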
                var defLevels = new short[] { 1, 0, 1 };
                var values    = new[] { 1, 2 };

                colWriter.WriteBatch(defLevels.Length, defLevels, null, values);

                writer.Close();
            }

            using var inStream       = new BufferReader(buffer);
            using var reader         = new ParquetFileReader(inStream);
            using var rowGroupReader = reader.RowGroup(0);
            using var colReader      = rowGroupReader.Column(0).LogicalReader<int?>();

            var results = new int?[3];

            colReader.ReadBatch(results, 0, 3);

            Assert.AreEqual(new int?[] { 1, null, 2 }, results);
        }
        public static void TestReadException()
        {
            var expected = Enumerable.Range(0, 1024 * 1024).ToArray();

            var exception = Assert.Throws<ParquetException>(() =>
            {
                using var buffer = new ErroneousReaderStream();

                using (var output = new ManagedOutputStream(buffer, leaveOpen: true))
                {
                    using var writer       = new ParquetFileWriter(output, new Column[] { new Column<int>("ids") });
                    using var groupWriter  = writer.AppendRowGroup();
                    using var columnWriter = groupWriter.NextColumn().LogicalWriter<int>();

                    columnWriter.WriteBatch(expected);

                    writer.Close();
                }

                buffer.Seek(0, SeekOrigin.Begin);

                using var input = new ManagedRandomAccessFile(buffer);
                using (new ParquetFileReader(input))
                {
                }
            });

            Assert.That(
                exception.Message,
                Contains.Substring("this is an erroneous reader"));
        }
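        // A plausible shape for the helper stream used above (an assumption; the
        // real class lives elsewhere in the test suite): a MemoryStream whose reads
        // always fail, so opening the written file surfaces a ParquetException.
        internal sealed class ErroneousReaderStream : MemoryStream
        {
            public override int Read(byte[] buffer, int offset, int count)
            {
                throw new IOException("this is an erroneous reader");
            }
        }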
        private static void TestRoundTrip(ExpectedColumn[] expectedColumns, bool useDictionaryEncoding)
        {
            var schema           = CreateSchema(expectedColumns);
            var writerProperties = CreateWriterProperties(expectedColumns, useDictionaryEncoding);
            var keyValueMetadata = new Dictionary<string, string> {
                { "case", "Test" }, { "Awesome", "true" }
            };

            using var buffer = new ResizableBuffer();

            // Write our expected columns to the parquet in-memory file.
            using (var outStream = new BufferOutputStream(buffer))
            {
                using var fileWriter     = new ParquetFileWriter(outStream, schema, writerProperties, keyValueMetadata);
                using var rowGroupWriter = fileWriter.AppendRowGroup();

                foreach (var column in expectedColumns)
                {
                    Console.WriteLine("Writing '{0}'", column.Name);

                    using var columnWriter = rowGroupWriter.NextColumn();
                    columnWriter.Apply(new ValueSetter(column.Values));
                }

                fileWriter.Close();
            }

            // Read back the columns and make sure they match.
            AssertReadRoundtrip(buffer, expectedColumns, useDictionaryEncoding);
        }
Example #6
        public static void TestWriteLongString()
        {
            const int numStrings = 100;

            // Generate lots of digits of 0.123456789101112131415...
            var strings = Enumerable.Range(0, numStrings).Select(i => "0." + string.Join("", Enumerable.Range(1, 3500).Select(j => j.ToString())) + "...").ToArray();

            using var buffer = new ResizableBuffer();

            using (var outStream = new BufferOutputStream(buffer))
            {
                using var fileWriter   = new ParquetFileWriter(outStream, new Column[] { new Column<string>("Name") });
                using var groupWriter  = fileWriter.AppendRowGroup();
                using var columnWriter = groupWriter.NextColumn().LogicalWriter<string>();

                // Strings to byte arrays memory pooling is done by the ByteBuffer class.
                // If something is fishy there (e.g. bad memory ownership wrt the GC),
                // we expect to see consequences here if we write enough strings.
                // It's not bullet proof, but it has found a few issues.
                columnWriter.WriteBatch(strings);

                fileWriter.Close();
            }

            using var inStream     = new BufferReader(buffer);
            using var fileReader   = new ParquetFileReader(inStream);
            using var groupReader  = fileReader.RowGroup(0);
            using var columnReader = groupReader.Column(0).LogicalReader<string>();

            Assert.AreEqual(strings, columnReader.ReadAll(numStrings));
        }
Example #7
        public static void TestAgainstThirdParty()
        {
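            // Precision 29 matches ParquetSharp's 16-byte decimal representation
            // (an implementation detail of its decimal support).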
            var columns = new Column[] { new Column<decimal>("Decimal", LogicalType.Decimal(precision: 29, scale: 3)) };
            var values  = Enumerable.Range(0, 10_000)
                          .Select(i => ((decimal)i * i * i) / 1000 - 10)
                          .Concat(new[] { decimal.MinValue / 1000, decimal.MaxValue / 1000 })
                          .ToArray();

            using var buffer = new ResizableBuffer();

            // Write using ParquetSharp
            using (var outStream = new BufferOutputStream(buffer))
            {
                using var fileWriter     = new ParquetFileWriter(outStream, columns, Compression.Snappy);
                using var rowGroupWriter = fileWriter.AppendRowGroup();
                using var columnWriter   = rowGroupWriter.NextColumn().LogicalWriter<decimal>();

                columnWriter.WriteBatch(values);

                fileWriter.Close();
            }

            // Read using Parquet.NET
            using var memoryStream   = new MemoryStream(buffer.ToArray());
            using var fileReader     = new ParquetReader(memoryStream);
            using var rowGroupReader = fileReader.OpenRowGroupReader(0);

            var read = (decimal[])rowGroupReader.ReadColumn(fileReader.Schema.GetDataFields()[0]).Data;

            Assert.AreEqual(values, read);
        }
Example #8
        public DecimalRead()
        {
            Console.WriteLine("Writing data...");

            var timer = Stopwatch.StartNew();
            var rand  = new Random(123);

            _values = Enumerable.Range(0, 1_000_000).Select(i =>
            {
                var n    = rand.Next();
                var sign = rand.NextDouble() < 0.5 ? -1M : +1M;
                return sign * ((decimal)n * n * n) / 1000M;
            }).ToArray();

            using (var fileWriter = new ParquetFileWriter(Filename, new Column[] { new Column<decimal>("Value", LogicalType.Decimal(precision: 29, scale: 3)) }))
            {
                using var rowGroupWriter = fileWriter.AppendRowGroup();
                using var valueWriter    = rowGroupWriter.NextColumn().LogicalWriter<decimal>();
                valueWriter.WriteBatch(_values);
                fileWriter.Close();
            }

            Console.WriteLine("Wrote {0:N0} rows in {1:N2} sec", _values.Length, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
        }
        private void ParquetImpl(ParquetFileWriter fileWriter)
        {
            using var rowGroupWriter = fileWriter.AppendRowGroup();

            using (var dateTimeWriter = rowGroupWriter.NextColumn().LogicalWriter<DateTime>())
            {
                for (int i = 0; i != _dates.Length; ++i)
                {
                    dateTimeWriter.WriteBatch(Enumerable.Repeat(_dates[i], _objectIds.Length).ToArray());
                }
            }

            using (var objectIdWriter = rowGroupWriter.NextColumn().LogicalWriter<int>())
            {
                for (int i = 0; i != _dates.Length; ++i)
                {
                    objectIdWriter.WriteBatch(_objectIds);
                }
            }

            using (var valueWriter = rowGroupWriter.NextColumn().LogicalWriter<float>())
            {
                for (int i = 0; i != _dates.Length; ++i)
                {
                    valueWriter.WriteBatch(_values[i]);
                }
            }

            fileWriter.Close();
        }
        public static void TestFileHandleHasBeenReleased()
        {
            var exception = Assert.Throws<InvalidCastException>(() =>
            {
                try
                {
                    using (var writer = new ParquetFileWriter("file.parquet", new Column[] { new Column<int>("ids") }))
                    {
                        using var groupWriter  = writer.AppendRowGroup();
                        using var columnWriter = groupWriter.NextColumn().LogicalWriter<int>();

                        columnWriter.WriteBatch(new[] { 1, 2, 3 });

                        writer.Close();
                    }

                    // Open with the wrong logical reader type on purpose.
                    using var reader       = new ParquetFileReader("file.parquet");
                    using var groupReader  = reader.RowGroup(0);
                    using var columnReader = groupReader.Column(0).LogicalReader<float>();

                    Assert.AreEqual(new[] { 1, 2, 3 }, columnReader.ReadAll(3));
                }
                finally
                {
                    // This will throw on Windows if the file handle has not been released.
                    File.Delete("file.parquet");
                }
            });

            StringAssert.StartsWith("Unable to cast object of type", exception?.Message);
        }
Example #11
        public static void TestArrayOfEmptyStringArraysRoundtrip()
        {
            var expected = new[]
            {
                new string[] { },
                new string[] { },
                new string[] { },
                new string[] { }
            };

            using var buffer = new ResizableBuffer();

            using (var outStream = new BufferOutputStream(buffer))
            {
                using var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column<string[]>("a") });

                using (var rowGroupWriter = fileWriter.AppendRowGroup())
                {
                    using var colWriter = rowGroupWriter.NextColumn().LogicalWriter<string[]>();
                    colWriter.WriteBatch(expected);
                }

                fileWriter.Close();
            }

            using var inStream     = new BufferReader(buffer);
            using var fileReader   = new ParquetFileReader(inStream);
            using var rowGroup     = fileReader.RowGroup(0);
            using var columnReader = rowGroup.Column(0).LogicalReader<string[]>();

            Assert.AreEqual(4, rowGroup.MetaData.NumRows);
            var allData = columnReader.ReadAll(4);

            Assert.AreEqual(expected, allData);
        }
Example #12
        public static void TestBufferOutputStreamFinish()
        {
            var expected = Enumerable.Range(0, 100).ToArray();

            using var outStream = new BufferOutputStream();

            // Write out a single column
            using (var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column<int>("int_field") }))
            {
                using (var rowGroupWriter = fileWriter.AppendRowGroup())
                {
                    using var colWriter = rowGroupWriter.NextColumn().LogicalWriter<int>();
                    colWriter.WriteBatch(expected);
                }

                fileWriter.Close();
            }

            // Read it back
            using var buffer       = outStream.Finish();
            using var inStream     = new BufferReader(buffer);
            using var fileReader   = new ParquetFileReader(inStream);
            using var rowGroup     = fileReader.RowGroup(0);
            using var columnReader = rowGroup.Column(0).LogicalReader<int>();

            var allData = columnReader.ReadAll((int)rowGroup.MetaData.NumRows);

            Assert.AreEqual(expected, allData);
        }
        public static void TestInMemoryRoundTrip()
        {
            var expected = Enumerable.Range(0, 1024 * 1024).ToArray();

            using var buffer = new MemoryStream();

            // Write test data.
            using (var output = new ManagedOutputStream(buffer, leaveOpen: true))
            {
                using var writer       = new ParquetFileWriter(output, new Column[] { new Column<int>("ids") });
                using var groupWriter  = writer.AppendRowGroup();
                using var columnWriter = groupWriter.NextColumn().LogicalWriter<int>();

                columnWriter.WriteBatch(expected);

                writer.Close();
            }

            // Seek back to start.
            buffer.Seek(0, SeekOrigin.Begin);

            // Read test data.
            using var input        = new ManagedRandomAccessFile(buffer, leaveOpen: true);
            using var reader       = new ParquetFileReader(input);
            using var groupReader  = reader.RowGroup(0);
            using var columnReader = groupReader.Column(0).LogicalReader<int>();

            Assert.AreEqual(expected, columnReader.ReadAll(expected.Length));
        }
        public static void TestFileStreamRoundTrip()
        {
            try
            {
                using (var output = new ManagedOutputStream(File.OpenWrite("file.parquet")))
                {
                    using var writer       = new ParquetFileWriter(output, new Column[] { new Column<int>("ids") });
                    using var groupWriter  = writer.AppendRowGroup();
                    using var columnWriter = groupWriter.NextColumn().LogicalWriter<int>();

                    columnWriter.WriteBatch(new[] { 1, 2, 3 });

                    writer.Close();
                }

                using var input        = new ManagedRandomAccessFile(File.OpenRead("file.parquet"));
                using var reader       = new ParquetFileReader(input);
                using var groupReader  = reader.RowGroup(0);
                using var columnReader = groupReader.Column(0).LogicalReader<int>();

                Assert.AreEqual(new[] { 1, 2, 3 }, columnReader.ReadAll(3));
            }
            finally
            {
                File.Delete("file.parquet");
            }
        }
        public static void TestHasNext()
        {
            const int numRows       = 5;
            var       schemaColumns = new Column[] { new Column<int>("int32_field") };
            var       values        = Enumerable.Range(0, numRows).ToArray();

            using var buffer = new ResizableBuffer();

            using (var outStream = new BufferOutputStream(buffer))
            {
                using var writer         = new ParquetFileWriter(outStream, schemaColumns);
                using var rowGroupWriter = writer.AppendRowGroup();
                using var colWriter      = (ColumnWriter<int>)rowGroupWriter.NextColumn();

                colWriter.WriteBatch(values);

                writer.Close();
            }

            // Read back the columns and make sure they match.
            using var inStream       = new BufferReader(buffer);
            using var fileReader     = new ParquetFileReader(inStream);
            using var rowGroupReader = fileReader.RowGroup(0);
            using var column         = (ColumnReader<int>)rowGroupReader.Column(0);

            var read = new int[1024];

            column.ReadBatch(1024, read, out var numValues);

            Assert.AreEqual(numRows, numValues);
            Assert.AreEqual(values, read.AsSpan(0, numRows).ToArray());
            Assert.IsFalse(column.HasNext);
        }
Example #16
        public static void TestReadWriteParquetMultipleTasks()
        {
            void WriteFile()
            {
                var schema = new Column[]
                {
                    new Column<DateTime>("Col1"),
                    new Column<int>("Col2"),
                    new Column<float>("Col3")
                };

                const int numRowGroups    = 7;
                const int rowsPerRowGroup = 21;
                var       data            = Enumerable.Range(0, rowsPerRowGroup).ToArray();

                using (var writer1 = new ParquetFileWriter(Task.CurrentId + ".parquet", schema))
                {
                    for (var i = 0; i < numRowGroups; i++)
                    {
                        using var rg1 = writer1.AppendRowGroup();

                        using (var col1Rg1 = rg1.NextColumn().LogicalWriter<DateTime>())
                        {
                            col1Rg1.WriteBatch(data.Select(n => new DateTime(2012, 1, 1).AddDays(n)).ToArray());
                        }

                        using (var col2Rg1 = rg1.NextColumn().LogicalWriter<int>())
                        {
                            col2Rg1.WriteBatch(data);
                        }

                        using (var col3Rg1 = rg1.NextColumn().LogicalWriter<float>())
                        {
                            col3Rg1.WriteBatch(data.Select(n => n + 0.1f).ToArray());
                        }
                    }

                    writer1.Close();
                }

                File.Delete(Task.CurrentId + ".parquet");

                Console.WriteLine(Task.CurrentId + " completed.");
            }

            const int numThreads = 14;
            const int numRuns    = 30000;
            var       running    = new Task[numRuns];

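            // Note that SetMaxThreads applies process-wide; it caps the ThreadPool that
            // Task.Factory.StartNew schedules onto, forcing heavy thread reuse.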
            ThreadPool.SetMaxThreads(numThreads, numThreads);

            foreach (var i in Enumerable.Range(0, numRuns))
            {
                running[i] = Task.Factory.StartNew(WriteFile, CancellationToken.None);
            }

            Task.WaitAll(running);
        }
        public static void TestSkip()
        {
            const int numRows = 11;

            var schemaColumns = new Column[] { new Column<int>("int32_field") };
            var values        = Enumerable.Range(0, numRows).ToArray();

            using var buffer = new ResizableBuffer();

            using (var outStream = new BufferOutputStream(buffer))
            {
                using var writer = new ParquetFileWriter(outStream, schemaColumns);

                using (var rowGroupWriter = writer.AppendRowGroup())
                {
                    var colWriter = (ColumnWriter<int>)rowGroupWriter.NextColumn();
                    colWriter.WriteBatch(numRows, values);
                }

                writer.Close();
            }

            using var inStream       = new BufferReader(buffer);
            using var fileReader     = new ParquetFileReader(inStream);
            using var rowGroupReader = fileReader.RowGroup(0);

            // Read back the columns after skipping numRows and make sure the values are what we expect.
            using (var column = rowGroupReader.Column(0))
            {
                const int numToSkip = 5;

                var skipped = column.Skip(numToSkip);

                Assert.AreEqual(numToSkip, skipped);

                var read = new int[1024];
                ((ColumnReader<int>)column).ReadBatch(1024, read, out var numValues);

                Assert.AreEqual(numRows - numToSkip, numValues);
                Assert.AreEqual(values.AsSpan(numToSkip).ToArray(), read.AsSpan(0, numRows - numToSkip).ToArray());
            }

            // Check skipped is bound to the maximum number of rows.
            using (var column = rowGroupReader.Column(0))
            {
                var skipped = column.Skip(1024);

                Assert.AreEqual(numRows, skipped);
                Assert.IsFalse(column.HasNext);
            }
        }
        public static void TestByteStreamSplitEncoding()
        {
            const int numRows = 10230;

            var ids    = Enumerable.Range(0, numRows).ToArray();
            var values = ids.Select(i => i / 3.14f).ToArray();

            using var buffer = new ResizableBuffer();

            using (var output = new BufferOutputStream(buffer))
            {
                var columns = new Column[]
                {
                    new Column<int>("id"),
                    new Column<float>("value")
                };

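                // The ByteStreamSplit encoding only takes effect when dictionary
                // encoding is disabled for the column, hence DisableDictionary("value").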
                var p = new WriterPropertiesBuilder()
                        .Compression(Compression.Lz4)
                        .DisableDictionary("value")
                        .Encoding("value", Encoding.ByteStreamSplit)
                        .Build();

                using var fileWriter  = new ParquetFileWriter(output, columns, p);
                using var groupWriter = fileWriter.AppendRowGroup();

                using var idWriter = groupWriter.NextColumn().LogicalWriter<int>();
                idWriter.WriteBatch(ids);

                using var valueWriter = groupWriter.NextColumn().LogicalWriter<float>();
                valueWriter.WriteBatch(values);

                fileWriter.Close();
            }

            using var input       = new BufferReader(buffer);
            using var fileReader  = new ParquetFileReader(input);
            using var groupReader = fileReader.RowGroup(0);

            using var metadataId    = groupReader.MetaData.GetColumnChunkMetaData(0);
            using var metadataValue = groupReader.MetaData.GetColumnChunkMetaData(1);

            Assert.AreEqual(new[] { Encoding.PlainDictionary, Encoding.Plain, Encoding.Rle }, metadataId.Encodings);
            Assert.AreEqual(new[] { Encoding.ByteStreamSplit, Encoding.Rle }, metadataValue.Encodings);

            using var idReader    = groupReader.Column(0).LogicalReader<int>();
            using var valueReader = groupReader.Column(1).LogicalReader<float>();

            Assert.AreEqual(ids, idReader.ReadAll(numRows));
            Assert.AreEqual(values, valueReader.ReadAll(numRows));
        }
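        // For intuition: ByteStreamSplit stores the k-th byte of every value in its
        // own contiguous stream, which typically compresses better for floating-point
        // data. A rough sketch of the layout only (not the library's implementation):
        private static byte[] ByteStreamSplitLayout(float[] values)
        {
            var bytes = new byte[values.Length * sizeof(float)];

            for (var i = 0; i < values.Length; i++)
            {
                var b = BitConverter.GetBytes(values[i]);

                for (var k = 0; k < sizeof(float); k++)
                {
                    // Stream k holds byte k of every value, streams laid out back to back.
                    bytes[k * values.Length + i] = b[k];
                }
            }

            return bytes;
        }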
        public static void TestDecimalSeries([Values(0, 1)] int warmup)
        {
            var timer = Stopwatch.StartNew();
            var rand  = new Random(123);

            Console.WriteLine("Generating data...");

            var values = Enumerable.Range(0, 10_000_000).Select(i =>
            {
                var n    = rand.Next();
                var sign = rand.NextDouble() < 0.5 ? -1M : +1M;
                return sign * ((decimal)n * n * n) / 1000M;
            }).ToArray();

            Console.WriteLine("Generated {0:N0} rows in {1:N2} sec", values.Length, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
            Console.WriteLine("Saving to Parquet");

            timer.Restart();

            using (var fileWriter = new ParquetFileWriter("decimal_timeseries.parquet", new Column[] { new Column<decimal>("Value", LogicalType.Decimal(precision: 29, scale: 3)) }))
            {
                using (var rowGroupWriter = fileWriter.AppendRowGroup())
                {
                    using var valueWriter = rowGroupWriter.NextColumn().LogicalWriter<decimal>();
                    valueWriter.WriteBatch(values);
                }

                fileWriter.Close();
            }

            Console.WriteLine("Saved to Parquet ({0:N0} bytes) in {1:N2} sec", new FileInfo("decimal_timeseries.parquet").Length, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
            Console.WriteLine("Saving to Parquet.NET");

            timer.Restart();

            {
                var valueField = new DecimalDataField("Value", precision: 29, scale: 3);
                var schema     = new Parquet.Data.Schema(valueField);

                using var stream        = File.Create("decimal_timeseries.parquet.net");
                using var parquetWriter = new ParquetWriter(schema, stream);
                using var groupWriter   = parquetWriter.CreateRowGroup();

                groupWriter.WriteColumn(new DataColumn(valueField, values));
            }

            Console.WriteLine("Saved to Parquet.NET ({0:N0} bytes) in {1:N2} sec", new FileInfo("decimal_timeseries.parquet.net").Length, timer.Elapsed.TotalSeconds);
        }
Example #20
        public FloatTimeSeriesRead()
        {
            Console.WriteLine("Writing data...");

            var timer = Stopwatch.StartNew();

            DateTime[] dates;
            int[]      objectIds;
            float[][]  values;
            (dates, objectIds, values, _numRows) = CreateFloatDataFrame(3600);

            _allDates = dates.SelectMany(d => Enumerable.Repeat(d, objectIds.Length)).ToArray();
            _allDatesAsDateTimeOffsets = dates.SelectMany(d => Enumerable.Repeat(new DateTimeOffset(d, TimeSpan.Zero), objectIds.Length)).ToArray();
            _allObjectIds = dates.SelectMany(d => objectIds).ToArray();
            _allValues    = dates.SelectMany((d, i) => values[i]).ToArray();

            using (var fileWriter = new ParquetFileWriter(Filename, CreateFloatColumns(), Compression.Snappy))
            {
                using var rowGroupWriter = fileWriter.AppendRowGroup();

                using (var dateTimeWriter = rowGroupWriter.NextColumn().LogicalWriter<DateTime>())
                {
                    for (int i = 0; i != dates.Length; ++i)
                    {
                        dateTimeWriter.WriteBatch(Enumerable.Repeat(dates[i], objectIds.Length).ToArray());
                    }
                }

                using (var objectIdWriter = rowGroupWriter.NextColumn().LogicalWriter<int>())
                {
                    for (int i = 0; i != dates.Length; ++i)
                    {
                        objectIdWriter.WriteBatch(objectIds);
                    }
                }

                using (var valueWriter = rowGroupWriter.NextColumn().LogicalWriter<float>())
                {
                    for (int i = 0; i != dates.Length; ++i)
                    {
                        valueWriter.WriteBatch(values[i]);
                    }
                }

                fileWriter.Close();
            }

            Console.WriteLine("Wrote {0:N0} rows in {1:N2} sec", _numRows, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
        }
Example #21
        public long ParquetSharp()
        {
            using (var fileWriter = new ParquetFileWriter("decimal_timeseries.parquet", new Column[] { new Column<decimal>("Value", LogicalType.Decimal(precision: 29, scale: 3)) }))
            {
                using var rowGroupWriter = fileWriter.AppendRowGroup();
                using var valueWriter    = rowGroupWriter.NextColumn().LogicalWriter<decimal>();

                valueWriter.WriteBatch(_values);

                fileWriter.Close();
            }

            return new FileInfo("decimal_timeseries.parquet").Length;
        }
Example #22
        public static void TestByteBufferOptimisation()
        {
            const int numStrings = 100_000;

            var strings = Enumerable.Range(0, numStrings).Select(i => i.ToString()).ToArray();

            var cancel = new CancellationTokenSource();
            var task   = Task.Run(() =>
            {
                while (!cancel.IsCancellationRequested)
                {
                    GC.Collect();
                    GC.WaitForPendingFinalizers();
                    Thread.Sleep(1);
                }
            });

            using (var buffer = new ResizableBuffer())
            {
                using (var outStream = new BufferOutputStream(buffer))
                {
                    using var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column<string>("Name") });

                    using (var groupWriter = fileWriter.AppendRowGroup())
                    {
                        using var columnWriter = groupWriter.NextColumn().LogicalWriter<string>();

                        // Strings to byte arrays memory pooling is done by the ByteBuffer class.
                        // If something is fishy there (e.g. bad memory ownership wrt the GC),
                        // we expect to see consequences here if we write enough strings.
                        // It's not bullet proof, but it has found a few issues.
                        columnWriter.WriteBatch(strings);
                    }

                    fileWriter.Close();
                }

                using var inStream     = new BufferReader(buffer);
                using var fileReader   = new ParquetFileReader(inStream);
                using var groupReader  = fileReader.RowGroup(0);
                using var columnReader = groupReader.Column(0).LogicalReader<string>();

                Assert.AreEqual(strings, columnReader.ReadAll(numStrings));
            }

            cancel.Cancel();
            task.Wait();
        }
Example #23
        public static void TestRoundTripBuffered(
            // 2^i, 7^j, 11^k are mutually co-prime for i,j,k>0
            [Values(2, 8, 32, 128)] int rowsPerBatch,
            [Values(7, 49, 343, 2401)] int writeBufferLength,
            [Values(11, 121, 1331)] int readBufferLength,
            [Values(true, false)] bool useDictionaryEncoding
            )
        {
            var expectedColumns = CreateExpectedColumns();
            var schemaColumns   = expectedColumns
                                  .Select(c => new Column(c.Values.GetType().GetElementType() ?? throw new InvalidOperationException(), c.Name, c.LogicalTypeOverride))
                                  .ToArray();

            using var buffer = new ResizableBuffer();

            // Write our expected columns to the parquet in-memory file.
            using (var outStream = new BufferOutputStream(buffer))
            {
                using var writerProperties = CreateWriterProperties(expectedColumns, useDictionaryEncoding);
                using var fileWriter       = new ParquetFileWriter(outStream, schemaColumns, writerProperties);
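                // A buffered row group keeps column data in memory until it is closed,
                // allowing ranges to be written to columns in any interleaved order
                // (AppendRowGroup, by contrast, writes one full column at a time).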
                using var rowGroupWriter   = fileWriter.AppendBufferedRowGroup();

                const int rangeLength = 9;

                for (int r = 0; r < NumRows; r += rangeLength)
                {
                    for (var i = 0; i < expectedColumns.Length; i++)
                    {
                        var column = expectedColumns[i];
                        var range  = (r, Math.Min(r + rangeLength, NumRows));

                        Console.WriteLine("Writing '{0}' (element type: {1}) (range: {2})", column.Name, column.Values.GetType().GetElementType(), range);

                        using var columnWriter = rowGroupWriter.Column(i).LogicalWriter(writeBufferLength);
                        columnWriter.Apply(new LogicalValueSetter(column.Values, rowsPerBatch, range));
                    }
                }

                fileWriter.Close();
            }

            Console.WriteLine();

            // Read back the columns and make sure they match.
            AssertReadRoundtrip(rowsPerBatch, readBufferLength, buffer, expectedColumns);
        }
Example #24
        public static void TestBigArrayRoundtrip()
        {
            // Create a big array of float arrays. Try to detect buffer-size related issues.
            var m  = 8196;
            var ar = new float[m];

            for (var i = 0; i < m; i++)
            {
                ar[i] = i;
            }

            var n        = 4;
            var expected = new float[n][];

            for (var i = 0; i < n; i++)
            {
                expected[i] = ar;
            }

            using var buffer = new ResizableBuffer();

            // Write out a single column
            using (var outStream = new BufferOutputStream(buffer))
            {
                using var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column<float[]>("big_array_field") });

                using (var rowGroupWriter = fileWriter.AppendRowGroup())
                {
                    using var colWriter = rowGroupWriter.NextColumn().LogicalWriter<float[]>();
                    colWriter.WriteBatch(expected);
                }

                fileWriter.Close();
            }

            // Read it back.
            using var inStream     = new BufferReader(buffer);
            using var fileReader   = new ParquetFileReader(inStream);
            using var rowGroup     = fileReader.RowGroup(0);
            using var columnReader = rowGroup.Column(0).LogicalReader<float[]>();

            var allData = columnReader.ReadAll((int)rowGroup.MetaData.NumRows);

            Assert.AreEqual(expected, allData);
        }
        private static void TestRoundTripBuffered(ExpectedColumn[] expectedColumns, bool useDictionaryEncoding)
        {
            // Same as the default round-trip test, but use buffered row groups.

            var schema           = CreateSchema(expectedColumns);
            var writerProperties = CreateWriterProperties(expectedColumns, useDictionaryEncoding);
            var keyValueMetadata = new Dictionary<string, string> {
                { "case", "Test" }, { "Awesome", "true" }
            };

            using var buffer = new ResizableBuffer();

            // Write our expected columns to the parquet in-memory file.
            using (var outStream = new BufferOutputStream(buffer))
            {
                using var fileWriter     = new ParquetFileWriter(outStream, schema, writerProperties, keyValueMetadata);
                using var rowGroupWriter = fileWriter.AppendBufferedRowGroup();

                const int rangeLength = 9;
                var       numRows     = expectedColumns.First().Values.Length;

                for (int r = 0; r < numRows; r += rangeLength)
                {
                    for (var i = 0; i < expectedColumns.Length; i++)
                    {
                        var column = expectedColumns[i];
                        var range  = (r, Math.Min(r + rangeLength, numRows));

                        if (range.Item1 == 0 || range.Item2 == numRows)
                        {
                            Console.WriteLine("Writing '{0}' (range: {1})", column.Name, range);
                        }

                        using var columnWriter = rowGroupWriter.Column(i);
                        columnWriter.Apply(new ValueSetter(column.Values, range));
                    }
                }

                fileWriter.Close();
            }

            // Read back the columns and make sure they match.
            AssertReadRoundtrip(buffer, expectedColumns, useDictionaryEncoding);
        }
Example #26
        public static void TestArrayEdgeCasesRoundtrip()
        {
            /*
             * [None, [], [1.0, None, 2.0]]
             * []
             * None
             * [[]]
             */
            var expected = new double?[][][]
            {
                new double?[][] { null, new double?[] { }, new double?[] { 1.0, null, 2.0 } },
                new double?[][] { },
                null,
                new double?[][] { new double?[] { } }
            };

            using var buffer = new ResizableBuffer();

            using (var outStream = new BufferOutputStream(buffer))
            {
                using var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column<double?[][]>("a") });

                using (var rowGroupWriter = fileWriter.AppendRowGroup())
                {
                    using var colWriter = rowGroupWriter.NextColumn().LogicalWriter<double?[][]>();
                    colWriter.WriteBatch(expected);
                }

                fileWriter.Close();
            }

            using var inStream     = new BufferReader(buffer);
            using var fileReader   = new ParquetFileReader(inStream);
            using var rowGroup     = fileReader.RowGroup(0);
            using var columnReader = rowGroup.Column(0).LogicalReader<double?[][]>();

            Assert.AreEqual(4, rowGroup.MetaData.NumRows);
            var allData = columnReader.ReadAll(4);

            Assert.AreEqual(expected, allData);
        }
Example #27
        public static unsafe void TestParquetReadFromBuffer()
        {
            var expected = Enumerable.Range(0, 100).ToArray();

            // Write out a single column
            byte[] parquetFileBytes;
            using (var outBuffer = new ResizableBuffer())
            {
                using (var outStream = new BufferOutputStream(outBuffer))
                {
                    using var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column<int>("int_field") });

                    using (var rowGroupWriter = fileWriter.AppendRowGroup())
                    {
                        using var colWriter = rowGroupWriter.NextColumn().LogicalWriter<int>();
                        colWriter.WriteBatch(expected);
                    }

                    fileWriter.Close();
                }

                parquetFileBytes = outBuffer.ToArray();
            }

            // Read it back
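            // Pinning the byte array prevents the GC from moving it while the
            // native reader accesses it through the raw pointer.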
            fixed (byte* fixedBytes = parquetFileBytes)
            {
                using var buffer       = new IO.Buffer(new IntPtr(fixedBytes), parquetFileBytes.Length);
                using var inStream     = new BufferReader(buffer);
                using var fileReader   = new ParquetFileReader(inStream);
                using var rowGroup     = fileReader.RowGroup(0);
                using var columnReader = rowGroup.Column(0).LogicalReader<int>();

                var allData = columnReader.ReadAll((int)rowGroup.MetaData.NumRows);

                Assert.AreEqual(expected, allData);
            }
        }
        private static ResizableBuffer WriteTestValues<TValue>(TValue[] written)
        {
            var buffer = new ResizableBuffer();

            try
            {
                using var output       = new BufferOutputStream(buffer);
                using var fileWriter   = new ParquetFileWriter(output, new Column[] { new Column<TValue>("values") });
                using var groupWriter  = fileWriter.AppendRowGroup();
                using var columnWriter = groupWriter.NextColumn().LogicalWriter<TValue>();

                columnWriter.WriteBatch(written);
                fileWriter.Close();

                return buffer;
            }
            catch
            {
                buffer.Dispose();
                throw;
            }
        }
        public static void TestRoundTrip(
            // 2^i, 7^j, 11^k are mutually co-prime for i,j,k>0
            [Values(2, 8, 32, 128)] int rowsPerBatch,
            [Values(7, 49, 343, 2401)] int writeBufferLength,
            [Values(11, 121, 1331)] int readBufferLength,
            [Values(true, false)] bool useDictionaryEncoding
            )
        {
            var expectedColumns = CreateExpectedColumns();
            var schemaColumns   = expectedColumns.Select(c => new Column(c.Values.GetType().GetElementType() ?? throw new InvalidOperationException(), c.Name, c.LogicalTypeOverride)).ToArray();

            using var buffer = new ResizableBuffer();

            // Write our expected columns to the parquet in-memory file.
            using (var outStream = new BufferOutputStream(buffer))
            {
                using var writerProperties = CreateWriterProperties(expectedColumns, useDictionaryEncoding);
                using var fileWriter       = new ParquetFileWriter(outStream, schemaColumns, writerProperties);
                using var rowGroupWriter   = fileWriter.AppendRowGroup();

                foreach (var column in expectedColumns)
                {
                    Console.WriteLine("Writing '{0}' ({1})", column.Name, column.Values.GetType().GetElementType());

                    using var columnWriter = rowGroupWriter.NextColumn().LogicalWriter(writeBufferLength);
                    columnWriter.Apply(new LogicalValueSetter(column.Values, rowsPerBatch));
                }

                fileWriter.Close();
            }

            Console.WriteLine();

            // Read back the columns and make sure they match.
            AssertReadRoundtrip(rowsPerBatch, readBufferLength, buffer, expectedColumns);
        }
        private static void TestWriteNoWriterOverride<TValue, TCustom>(TValue[] expected, TCustom[] written)
        {
            using var buffer = new ResizableBuffer();

            // Write float values using a custom user-type:
            // - Provide a type factory such that Column<VolumeInDollars> can be converted to the right schema node.
            // - Provide a converter factory such that VolumeInDollars values can be written as floats.
            // - Do not explicitly override the expected type when accessing the LogicalColumnWriter.

            using (var output = new BufferOutputStream(buffer))
            {
                using var fileWriter = new ParquetFileWriter(output, new Column[] { new Column<TCustom>("values") }, new WriteTypeFactory())
                      {
                          LogicalWriteConverterFactory = new WriteConverterFactory()
                      };
                using var groupWriter  = fileWriter.AppendRowGroup();
                using var columnWriter = groupWriter.NextColumn().LogicalWriter<TCustom>();

                columnWriter.WriteBatch(written);
                fileWriter.Close();
            }

            CheckWrittenValues(buffer, expected);
        }