Example #1
        public DecimalRead()
        {
            Console.WriteLine("Writing data...");

            var timer = Stopwatch.StartNew();
            var rand  = new Random(123);

            _values = Enumerable.Range(0, 1_000_000).Select(i =>
            {
                var n    = rand.Next();
                var sign = rand.NextDouble() < 0.5 ? -1M : +1M;
                return sign * ((decimal)n * n * n) / 1000M;
            }).ToArray();

            using (var fileWriter = new ParquetFileWriter(Filename, new Column[] { new Column <decimal>("Value", LogicalType.Decimal(precision: 29, scale: 3)) }))
            {
                using var rowGroupWriter = fileWriter.AppendRowGroup();
                using var valueWriter    = rowGroupWriter.NextColumn().LogicalWriter <decimal>();
                valueWriter.WriteBatch(_values);
                fileWriter.Close();
            }

            Console.WriteLine("Wrote {0:N0} rows in {1:N2} sec", _values.Length, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
        }
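
A minimal read-back sketch for the file written above, using only APIs shown elsewhere in these examples (the Filename constant and the decimal column layout are assumed from the surrounding class; ReadDecimalsBack is a hypothetical helper name):

        public decimal[] ReadDecimalsBack()
        {
            using var fileReader  = new ParquetFileReader(Filename);
            using var rowGroup    = fileReader.RowGroup(0);
            using var valueReader = rowGroup.Column(0).LogicalReader<decimal>();

            // Read the whole first row group back into memory.
            return valueReader.ReadAll((int)rowGroup.MetaData.NumRows);
        }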
Example #2
        public static unsafe void TestParquetReadFromBuffer()
        {
            var expected = Enumerable.Range(0, 100).ToArray();

            // Write out a single column
            byte[] parquetFileBytes;
            using (var outBuffer = new ResizableBuffer())
            {
                using (var outStream = new BufferOutputStream(outBuffer))
                    using (var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column <int>("int_field") }))
                        using (var rowGroupWriter = fileWriter.AppendRowGroup())
                            using (var colWriter = rowGroupWriter.NextColumn().LogicalWriter <int>())
                            {
                                colWriter.WriteBatch(expected);
                            }

                parquetFileBytes = outBuffer.ToArray();
            }

            // Read it back
            fixed (byte* fixedBytes = parquetFileBytes)
            using (var buffer = new IO.Buffer(new IntPtr(fixedBytes), parquetFileBytes.Length))
                using (var inStream = new BufferReader(buffer))
                    using (var fileReader = new ParquetFileReader(inStream))
                        using (var rowGroup = fileReader.RowGroup(0))
                            using (var columnReader = rowGroup.Column(0).LogicalReader <int>())
                            {
                                var allData = columnReader.ReadAll((int)rowGroup.MetaData.NumRows);
                                Assert.AreEqual(expected, allData);
                            }
        }
Example #3
        public static void TestWriteLongString()
        {
            const int numStrings = 100;

            // Generate lots of digits of 0.123456789101112131415...
            var strings = Enumerable.Range(0, numStrings).Select(i => "0." + string.Join("", Enumerable.Range(1, 3500).Select(j => j.ToString())) + "...").ToArray();

            using var buffer = new ResizableBuffer();

            using (var outStream = new BufferOutputStream(buffer))
            {
                using var fileWriter   = new ParquetFileWriter(outStream, new Column[] { new Column <string>("Name") });
                using var groupWriter  = fileWriter.AppendRowGroup();
                using var columnWriter = groupWriter.NextColumn().LogicalWriter <string>();

                // Memory pooling for the string-to-byte-array conversion is done by the ByteBuffer class.
                // If something is fishy there (e.g. bad memory ownership with respect to the GC),
                // we expect to see the consequences here if we write enough strings.
                // It's not bulletproof, but it has found a few issues.
                columnWriter.WriteBatch(strings);

                fileWriter.Close();
            }

            using var inStream     = new BufferReader(buffer);
            using var fileReader   = new ParquetFileReader(inStream);
            using var groupReader  = fileReader.RowGroup(0);
            using var columnReader = groupReader.Column(0).LogicalReader <string>();

            Assert.AreEqual(strings, columnReader.ReadAll(numStrings));
        }
        public static void TestFileStreamRoundTrip()
        {
            try
            {
                using (var output = new ManagedOutputStream(File.OpenWrite("file.parquet")))
                {
                    using var writer       = new ParquetFileWriter(output, new Column[] { new Column <int>("ids") });
                    using var groupWriter  = writer.AppendRowGroup();
                    using var columnWriter = groupWriter.NextColumn().LogicalWriter <int>();

                    columnWriter.WriteBatch(new[] { 1, 2, 3 });

                    writer.Close();
                }

                using var input        = new ManagedRandomAccessFile(File.OpenRead("file.parquet"));
                using var reader       = new ParquetFileReader(input);
                using var groupReader  = reader.RowGroup(0);
                using var columnReader = groupReader.Column(0).LogicalReader <int>();

                Assert.AreEqual(new[] { 1, 2, 3 }, columnReader.ReadAll(3));
            }
            finally
            {
                File.Delete("file.parquet");
            }
        }
Example #5
        public static void TestArrayOfEmptyStringArraysRoundtrip()
        {
            var expected = new[]
            {
                new string[] { },
                new string[] { },
                new string[] { },
                new string[] { }
            };

            using var buffer = new ResizableBuffer();

            using (var outStream = new BufferOutputStream(buffer))
            {
                using var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column <string[]>("a") });

                using (var rowGroupWriter = fileWriter.AppendRowGroup())
                {
                    using var colWriter = rowGroupWriter.NextColumn().LogicalWriter <string[]>();
                    colWriter.WriteBatch(expected);
                }

                fileWriter.Close();
            }

            using var inStream     = new BufferReader(buffer);
            using var fileReader   = new ParquetFileReader(inStream);
            using var rowGroup     = fileReader.RowGroup(0);
            using var columnReader = rowGroup.Column(0).LogicalReader <string[]>();

            Assert.AreEqual(4, rowGroup.MetaData.NumRows);
            var allData = columnReader.ReadAll(4);

            Assert.AreEqual(expected, allData);
        }
Example #6
        public static void TestWriteBatchWithNullOptionalField()
        {
            using (var buffer = new ResizableBuffer())
            {
                using (var outStream = new BufferOutputStream(buffer))
                    using (var writer = new ParquetFileWriter(outStream, new Column[] { new Column <int?>("int32?") }))
                        using (var rowGroupWriter = writer.AppendRowGroup())
                            using (var colWriter = (ColumnWriter <int>)rowGroupWriter.NextColumn())
                            {
                                var defLevels = new short[] { 1, 0, 1 };
                                var values    = new[] { 1, 2 };

                                colWriter.WriteBatch(defLevels.Length, defLevels, null, values);
                            }

                using (var inStream = new BufferReader(buffer))
                    using (var reader = new ParquetFileReader(inStream))
                        using (var rowGroupReader = reader.RowGroup(0))
                            using (var colReader = rowGroupReader.Column(0).LogicalReader <int?>())
                            {
                                var results = new int?[3];
                                colReader.ReadBatch(results, 0, 3);

                                Assert.AreEqual(new int?[] { 1, null, 2 }, results);
                            }
            }
        }
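
The definition levels written above encode nullability by hand: { 1, 0, 1 } marks the middle value as null. A minimal sketch of the equivalent write through the logical layer, which derives the definition levels itself (same in-memory buffer setup assumed; WriteNullableViaLogicalWriter is a hypothetical helper name):

        private static void WriteNullableViaLogicalWriter(RowGroupWriter rowGroupWriter)
        {
            // LogicalWriter<int?> accepts nullable values directly and computes
            // the definition levels (1 = present, 0 = null) under the hood.
            using var colWriter = rowGroupWriter.NextColumn().LogicalWriter<int?>();
            colWriter.WriteBatch(new int?[] { 1, null, 2 });
        }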
        private static void TestRoundTrip(ExpectedColumn[] expectedColumns, bool useDictionaryEncoding)
        {
            var schema           = CreateSchema(expectedColumns);
            var writerProperties = CreateWriterProperties(expectedColumns, useDictionaryEncoding);
            var keyValueMetadata = new Dictionary <string, string> {
                { "case", "Test" }, { "Awesome", "true" }
            };

            using var buffer = new ResizableBuffer();

            // Write our expected columns to the parquet in-memory file.
            using (var outStream = new BufferOutputStream(buffer))
            {
                using var fileWriter     = new ParquetFileWriter(outStream, schema, writerProperties, keyValueMetadata);
                using var rowGroupWriter = fileWriter.AppendRowGroup();

                foreach (var column in expectedColumns)
                {
                    Console.WriteLine("Writing '{0}'", column.Name);

                    using var columnWriter = rowGroupWriter.NextColumn();
                    columnWriter.Apply(new ValueSetter(column.Values));
                }

                fileWriter.Close();
            }

            // Read back the columns and make sure they match.
            AssertReadRoundtrip(buffer, expectedColumns, useDictionaryEncoding);
        }
        public static void TestFileHandleHasBeenReleased()
        {
            var exception = Assert.Throws <InvalidCastException>(() =>
            {
                try
                {
                    using (var writer = new ParquetFileWriter("file.parquet", new Column[] { new Column <int>("ids") }))
                        using (var group = writer.AppendRowGroup())
                            using (var column = group.NextColumn().LogicalWriter <int>())
                            {
                                column.WriteBatch(new[] { 1, 2, 3 });
                            }

                    // Open with the wrong logical reader type on purpose.
                    using (var reader = new ParquetFileReader("file.parquet"))
                        using (var group = reader.RowGroup(0))
                            using (var column = group.Column(0).LogicalReader <float>())
                            {
                                Assert.AreEqual(new[] { 1, 2, 3 }, column.ReadAll(3));
                            }
                }
                finally
                {
                    // This will throw on Windows if the file handle has not been released.
                    File.Delete("file.parquet");
                }
            });

            Assert.AreEqual(
                "Unable to cast object of type " +
                "'ParquetSharp.LogicalColumnReader`3[System.Int32,System.Int32,System.Int32]'" +
                " to type 'ParquetSharp.LogicalColumnReader`1[System.Single]'.",
                exception.Message);
        }
Example #9
        public static void TestReadException()
        {
            var expected = Enumerable.Range(0, 1024 * 1024).ToArray();

            var exception = Assert.Throws <ParquetException>(() =>
            {
                using (var buffer = new ErroneousReaderStream())
                {
                    using (var output = new ManagedOutputStream(buffer, leaveOpen: true))
                        using (var writer = new ParquetFileWriter(output, new Column[] { new Column <int>("ids") }))
                            using (var group = writer.AppendRowGroup())
                                using (var column = group.NextColumn().LogicalWriter <int>())
                                {
                                    column.WriteBatch(expected);
                                }

                    buffer.Seek(0, SeekOrigin.Begin);

                    using (var input = new ManagedRandomAccessFile(buffer))
                        using (new ParquetFileReader(input))
                        {
                        }
                }
            });

            Assert.That(
                exception.Message,
                Contains.Substring("this is an erroneous reader"));
        }
Example #10
        public static void TestHasNext()
        {
            const int numRows       = 5;
            var       schemaColumns = new Column[] { new Column <int>("int32_field") };
            var       values        = Enumerable.Range(0, numRows).ToArray();

            using var buffer = new ResizableBuffer();

            using (var outStream = new BufferOutputStream(buffer))
            {
                using var writer         = new ParquetFileWriter(outStream, schemaColumns);
                using var rowGroupWriter = writer.AppendRowGroup();
                using var colWriter      = (ColumnWriter <int>)rowGroupWriter.NextColumn();

                colWriter.WriteBatch(values);

                writer.Close();
            }

            // Read back the columns and make sure they match.
            using var inStream       = new BufferReader(buffer);
            using var fileReader     = new ParquetFileReader(inStream);
            using var rowGroupReader = fileReader.RowGroup(0);
            using var column         = (ColumnReader <int>)rowGroupReader.Column(0);

            var read = new int[1024];

            column.ReadBatch(1024, read, out var numValues);

            Assert.AreEqual(numRows, numValues);
            Assert.AreEqual(values, read.AsSpan(0, numRows).ToArray());
            Assert.IsFalse(column.HasNext);
        }
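
The test above goes through the physical ColumnReader<int>, which hands back raw values plus a count. A sketch of the same read via the logical layer, sized from the row-group metadata instead of an oversized scratch buffer (same reader setup assumed; ReadAllViaLogicalReader is a hypothetical helper name):

        private static int[] ReadAllViaLogicalReader(RowGroupReader rowGroupReader)
        {
            // The logical reader converts physical values to the .NET type
            // and can read exactly NumRows values in a single call.
            using var columnReader = rowGroupReader.Column(0).LogicalReader<int>();
            return columnReader.ReadAll((int)rowGroupReader.MetaData.NumRows);
        }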
        private void ParquetImpl(ParquetFileWriter fileWriter)
        {
            using var rowGroupWriter = fileWriter.AppendRowGroup();

            using (var dateTimeWriter = rowGroupWriter.NextColumn().LogicalWriter <DateTime>())
            {
                for (int i = 0; i != _dates.Length; ++i)
                {
                    dateTimeWriter.WriteBatch(Enumerable.Repeat(_dates[i], _objectIds.Length).ToArray());
                }
            }

            using (var objectIdWriter = rowGroupWriter.NextColumn().LogicalWriter <int>())
            {
                for (int i = 0; i != _dates.Length; ++i)
                {
                    objectIdWriter.WriteBatch(_objectIds);
                }
            }

            using (var valueWriter = rowGroupWriter.NextColumn().LogicalWriter <float>())
            {
                for (int i = 0; i != _dates.Length; ++i)
                {
                    valueWriter.WriteBatch(_values[i]);
                }
            }

            fileWriter.Close();
        }
        private static void TestWriteNoColumnNorWriterOverride <TValue, TCustom>(TValue[] expected, TCustom[] written)
        {
            using var buffer = new ResizableBuffer();

            // Write float values using a custom user-type:
            // - Provide explicit schema definition that knows nothing about VolumeInDollars, and states that it's a float column.
            // - Provide a type factory such that Column("values") is known to be of VolumeInDollars,
            //   as we do not explicitly state the expected type when accessing the LogicalColumnWriter.
            // - Provide a converter factory such that VolumeInDollars values can be written as floats.
            // - Do not explicitly override the expected type when accessing the LogicalColumnWriter.

            using (var output = new BufferOutputStream(buffer))
            {
                using var schema           = Column.CreateSchemaNode(new Column[] { new Column <TValue>("values") });
                using var writerProperties = CreateWriterProperties();
                using var fileWriter       = new ParquetFileWriter(output, schema, writerProperties)
                      {
                          LogicalTypeFactory           = new WriteTypeFactoryNoOverride(),
                          LogicalWriteConverterFactory = new WriteConverterFactory()
                      };
                using var groupWriter  = fileWriter.AppendRowGroup();
                using var columnWriter = groupWriter.NextColumn().LogicalWriter <TCustom>();

                columnWriter.WriteBatch(written);
                fileWriter.Close();
            }

            CheckWrittenValues(buffer, expected);
        }
Example #13
        private void CreateParquetFile(ResizableBuffer buffer)
        {
            using (var output = new BufferOutputStream(buffer))
                using (var fileWriter = new ParquetFileWriter(output, CreateFloatColumns(), keyValueMetadata: _keyValueProperties))
                {
                    using var rowGroupWriter = fileWriter.AppendRowGroup();

                    using (var dateTimeWriter = rowGroupWriter.NextColumn().LogicalWriter <DateTime>())
                    {
                        for (int i = 0; i != _dates.Length; ++i)
                        {
                            dateTimeWriter.WriteBatch(Enumerable.Repeat(_dates[i], _objectIds.Length).ToArray());
                        }
                    }

                    using (var objectIdWriter = rowGroupWriter.NextColumn().LogicalWriter <int>())
                    {
                        for (int i = 0; i != _dates.Length; ++i)
                        {
                            objectIdWriter.WriteBatch(_objectIds);
                        }
                    }

                    using (var valueWriter = rowGroupWriter.NextColumn().LogicalWriter <float>())
                    {
                        for (int i = 0; i != _dates.Length; ++i)
                        {
                            valueWriter.WriteBatch(_values[i]);
                        }
                    }

                    fileWriter.Close();
                }
        }
Example #14
        private ParquetRowWriter(ParquetFileWriter parquetFileWriter, WriteAction writeAction)
        {
            _parquetFileWriter = parquetFileWriter;
            _rowGroupWriter    = _parquetFileWriter.AppendRowGroup();
            _writeAction       = writeAction;

            // Pre-allocate a buffer of rows so that writes can be batched.
            _rows = new TTuple[1024];
        }
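
The constructor above is internal to ParquetSharp's row-oriented API; user code normally goes through ParquetSharp.RowOriented instead. A minimal usage sketch, assuming the ParquetFile.CreateRowWriter entry point with explicit column names (check the exact signature for your ParquetSharp version; WriteRowOriented is a hypothetical helper name):

        private static void WriteRowOriented()
        {
            // Each tuple becomes one row; the writer batches rows internally
            // before flushing them to the current row group.
            using var rowWriter = ParquetFile.CreateRowWriter<(DateTime, int, float)>(
                "rows.parquet", new[] { "Date", "ObjectId", "Value" });

            rowWriter.WriteRow((new DateTime(2012, 1, 1), 42, 3.14f));
        }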
        public static void TestFileHandleHasBeenReleased()
        {
            var exception = Assert.Throws <InvalidCastException>(() =>
            {
                try
                {
                    using (var writer = new ParquetFileWriter("file.parquet", new Column[] { new Column <int>("ids") }))
                    {
                        using var groupWriter  = writer.AppendRowGroup();
                        using var columnWriter = groupWriter.NextColumn().LogicalWriter <int>();

                        columnWriter.WriteBatch(new[] { 1, 2, 3 });

                        writer.Close();
                    }

                    // Open with the wrong logical reader type on purpose.
                    using var reader       = new ParquetFileReader("file.parquet");
                    using var groupReader  = reader.RowGroup(0);
                    using var columnReader = groupReader.Column(0).LogicalReader <float>();

                    Assert.AreEqual(new[] { 1, 2, 3 }, columnReader.ReadAll(3));
                }
                finally
                {
                    // This will throw on Windows if the file handle has not been released.
                    File.Delete("file.parquet");
                }
            });

            StringAssert.StartsWith("Unable to cast object of type", exception?.Message);
        }
Example #16
        public static void TestBufferOutputStreamFinish()
        {
            var expected = Enumerable.Range(0, 100).ToArray();

            using var outStream = new BufferOutputStream();

            // Write out a single column
            using (var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column <int>("int_field") }))
            {
                using (var rowGroupWriter = fileWriter.AppendRowGroup())
                {
                    using var colWriter = rowGroupWriter.NextColumn().LogicalWriter <int>();
                    colWriter.WriteBatch(expected);
                }

                fileWriter.Close();
            }

            // Read it back
            using var buffer       = outStream.Finish();
            using var inStream     = new BufferReader(buffer);
            using var fileReader   = new ParquetFileReader(inStream);
            using var rowGroup     = fileReader.RowGroup(0);
            using var columnReader = rowGroup.Column(0).LogicalReader <int>();

            var allData = columnReader.ReadAll((int)rowGroup.MetaData.NumRows);

            Assert.AreEqual(expected, allData);
        }
        public static void TestInMemoryRoundTrip()
        {
            var expected = Enumerable.Range(0, 1024 * 1024).ToArray();

            using var buffer = new MemoryStream();

            // Write test data.
            using (var output = new ManagedOutputStream(buffer, leaveOpen: true))
            {
                using var writer       = new ParquetFileWriter(output, new Column[] { new Column <int>("ids") });
                using var groupWriter  = writer.AppendRowGroup();
                using var columnWriter = groupWriter.NextColumn().LogicalWriter <int>();

                columnWriter.WriteBatch(expected);

                writer.Close();
            }

            // Seek back to start.
            buffer.Seek(0, SeekOrigin.Begin);

            // Read test data.
            using var input        = new ManagedRandomAccessFile(buffer, leaveOpen: true);
            using var reader       = new ParquetFileReader(input);
            using var groupReader  = reader.RowGroup(0);
            using var columnReader = groupReader.Column(0).LogicalReader <int>();

            Assert.AreEqual(expected, columnReader.ReadAll(expected.Length));
        }
Example #18
        public static void TestAgainstThirdParty()
        {
            var columns = new Column[] { new Column <decimal>("Decimal", LogicalType.Decimal(precision: 29, scale: 3)) };
            var values  = Enumerable.Range(0, 10_000)
                          .Select(i => ((decimal)i * i * i) / 1000 - 10)
                          .Concat(new[] { decimal.MinValue / 1000, decimal.MaxValue / 1000 })
                          .ToArray();

            using var buffer = new ResizableBuffer();

            // Write using ParquetSharp
            using (var outStream = new BufferOutputStream(buffer))
            {
                using var fileWriter     = new ParquetFileWriter(outStream, columns, Compression.Snappy);
                using var rowGroupWriter = fileWriter.AppendRowGroup();
                using var columnWriter   = rowGroupWriter.NextColumn().LogicalWriter <decimal>();

                columnWriter.WriteBatch(values);

                fileWriter.Close();
            }

            // Read using Parquet.NET
            using var memoryStream   = new MemoryStream(buffer.ToArray());
            using var fileReader     = new ParquetReader(memoryStream);
            using var rowGroupReader = fileReader.OpenRowGroupReader(0);

            var read = (decimal[])rowGroupReader.ReadColumn(fileReader.Schema.GetDataFields()[0]).Data;

            Assert.AreEqual(values, read);
        }
Example #19
        public static void TestReadWriteParquetMultipleTasks()
        {
            void WriteFile()
            {
                var schema = new Column[]
                {
                    new Column <DateTime>("Col1"),
                    new Column <int>("Col2"),
                    new Column <float>("Col3")
                };

                const int numRowGroups    = 7;
                const int rowsPerRowGroup = 21;
                var       data            = Enumerable.Range(0, rowsPerRowGroup).ToArray();

                using (var writer1 = new ParquetFileWriter(Task.CurrentId + ".parquet", schema))
                {
                    for (var i = 0; i < numRowGroups; i++)
                    {
                        using var rg1 = writer1.AppendRowGroup();

                        using (var col1Rg1 = rg1.NextColumn().LogicalWriter <DateTime>())
                        {
                            col1Rg1.WriteBatch(data.Select(n => new DateTime(2012, 1, 1).AddDays(n)).ToArray());
                        }

                        using (var col2Rg1 = rg1.NextColumn().LogicalWriter <int>())
                        {
                            col2Rg1.WriteBatch(data);
                        }

                        using (var col3Rg1 = rg1.NextColumn().LogicalWriter <float>())
                        {
                            col3Rg1.WriteBatch(data.Select(n => n + 0.1f).ToArray());
                        }
                    }

                    writer1.Close();
                }

                File.Delete(Task.CurrentId + ".parquet");

                Console.WriteLine(Task.CurrentId + " completed.");
            }

            const int numThreads = 14;
            const int numRuns    = 30000;
            var       running    = new Task[numRuns];

            ThreadPool.SetMaxThreads(numThreads, numThreads);

            foreach (var i in Enumerable.Range(0, numRuns))
            {
                running[i] = Task.Factory.StartNew(WriteFile, CancellationToken.None);
            }

            Task.WaitAll(running);
        }
        public static void TestSkip()
        {
            const int numRows = 11;

            var schemaColumns = new Column[] { new Column <int>("int32_field") };
            var values        = Enumerable.Range(0, numRows).ToArray();

            using var buffer = new ResizableBuffer();

            using (var outStream = new BufferOutputStream(buffer))
            {
                using var writer = new ParquetFileWriter(outStream, schemaColumns);

                using (var rowGroupWriter = writer.AppendRowGroup())
                {
                    using var colWriter = (ColumnWriter <int>)rowGroupWriter.NextColumn();
                    colWriter.WriteBatch(numRows, values);
                }

                writer.Close();
            }

            using var inStream       = new BufferReader(buffer);
            using var fileReader     = new ParquetFileReader(inStream);
            using var rowGroupReader = fileReader.RowGroup(0);

            // Read back the columns after skipping numRows and make sure the values are what we expect.
            using (var column = rowGroupReader.Column(0))
            {
                const int numToSkip = 5;

                var skipped = column.Skip(numToSkip);

                Assert.AreEqual(numToSkip, skipped);

                var read = new int[1024];
                ((ColumnReader <int>)column).ReadBatch(1024, read, out var numValues);

                Assert.AreEqual(numRows - numToSkip, numValues);
                Assert.AreEqual(values.AsSpan(numToSkip).ToArray(), read.AsSpan(0, numRows - numToSkip).ToArray());
            }

            // Check skipped is bound to the maximum number of rows.
            using (var column = rowGroupReader.Column(0))
            {
                var skipped = column.Skip(1024);

                Assert.AreEqual(numRows, skipped);
                Assert.IsFalse(column.HasNext);
            }
        }
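
Skip and ReadBatch compose for paging through a column chunk without materialising it all at once. A minimal chunked-read sketch using only the physical reader APIs shown above (the chunk size is arbitrary; ReadInChunks is a hypothetical helper name):

        private static List<int> ReadInChunks(ColumnReader<int> column, int chunkSize)
        {
            var all   = new List<int>();
            var chunk = new int[chunkSize];

            // HasNext turns false once every value in the column chunk has been consumed.
            while (column.HasNext)
            {
                column.ReadBatch(chunkSize, chunk, out var numValues);
                all.AddRange(chunk.Take((int)numValues));
            }

            return all;
        }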
        public static void TestByteStreamSplitEncoding()
        {
            const int numRows = 10230;

            var ids    = Enumerable.Range(0, numRows).ToArray();
            var values = ids.Select(i => i / 3.14f).ToArray();

            using var buffer = new ResizableBuffer();

            using (var output = new BufferOutputStream(buffer))
            {
                var columns = new Column[]
                {
                    new Column <int>("id"),
                    new Column <float>("value")
                };

                var p = new WriterPropertiesBuilder()
                        .Compression(Compression.Lz4)
                        .DisableDictionary("value")
                        .Encoding("value", Encoding.ByteStreamSplit)
                        .Build();

                using var fileWriter  = new ParquetFileWriter(output, columns, p);
                using var groupWriter = fileWriter.AppendRowGroup();

                using var idWriter = groupWriter.NextColumn().LogicalWriter <int>();
                idWriter.WriteBatch(ids);

                using var valueWriter = groupWriter.NextColumn().LogicalWriter <float>();
                valueWriter.WriteBatch(values);

                fileWriter.Close();
            }

            using var input       = new BufferReader(buffer);
            using var fileReader  = new ParquetFileReader(input);
            using var groupReader = fileReader.RowGroup(0);

            using var metadataId    = groupReader.MetaData.GetColumnChunkMetaData(0);
            using var metadataValue = groupReader.MetaData.GetColumnChunkMetaData(1);

            Assert.AreEqual(new[] { Encoding.PlainDictionary, Encoding.Plain, Encoding.Rle }, metadataId.Encodings);
            Assert.AreEqual(new[] { Encoding.ByteStreamSplit, Encoding.Rle }, metadataValue.Encodings);

            using var idReader    = groupReader.Column(0).LogicalReader <int>();
            using var valueReader = groupReader.Column(1).LogicalReader <float>();

            Assert.AreEqual(ids, idReader.ReadAll(numRows));
            Assert.AreEqual(values, valueReader.ReadAll(numRows));
        }
Example #22
        public FloatTimeSeriesRead()
        {
            Console.WriteLine("Writing data...");

            var timer = Stopwatch.StartNew();

            DateTime[] dates;
            int[]      objectIds;
            float[][]  values;
            (dates, objectIds, values, _numRows) = CreateFloatDataFrame(3600);

            _allDates = dates.SelectMany(d => Enumerable.Repeat(d, objectIds.Length)).ToArray();
            _allDatesAsDateTimeOffsets = dates.SelectMany(d => Enumerable.Repeat(new DateTimeOffset(d, TimeSpan.Zero), objectIds.Length)).ToArray();
            _allObjectIds = dates.SelectMany(d => objectIds).ToArray();
            _allValues    = dates.SelectMany((d, i) => values[i]).ToArray();

            using (var fileWriter = new ParquetFileWriter(Filename, CreateFloatColumns(), Compression.Snappy))
            {
                using var rowGroupWriter = fileWriter.AppendRowGroup();

                using (var dateTimeWriter = rowGroupWriter.NextColumn().LogicalWriter <DateTime>())
                {
                    for (int i = 0; i != dates.Length; ++i)
                    {
                        dateTimeWriter.WriteBatch(Enumerable.Repeat(dates[i], objectIds.Length).ToArray());
                    }
                }

                using (var objectIdWriter = rowGroupWriter.NextColumn().LogicalWriter <int>())
                {
                    for (int i = 0; i != dates.Length; ++i)
                    {
                        objectIdWriter.WriteBatch(objectIds);
                    }
                }

                using (var valueWriter = rowGroupWriter.NextColumn().LogicalWriter <float>())
                {
                    for (int i = 0; i != dates.Length; ++i)
                    {
                        valueWriter.WriteBatch(values[i]);
                    }
                }

                fileWriter.Close();
            }

            Console.WriteLine("Wrote {0:N0} rows in {1:N2} sec", _numRows, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
        }
Example #23
        public static void TestDisposedAccess()
        {
            using var buffer = new ResizableBuffer();

            // Write our expected columns to the parquet in-memory file.
            using var outStream  = new BufferOutputStream(buffer);
            using var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column <int>("Index") });

            fileWriter.Dispose();

            var exception = Assert.Throws <NullReferenceException>(() => fileWriter.AppendRowGroup());

            Assert.AreEqual("null native handle", exception.Message);
        }
        public static void TestWriteNoConverterFactory()
        {
            // Test that we cannot write values using a custom type without providing a factory.
            using var buffer           = new ResizableBuffer();
            using var output           = new BufferOutputStream(buffer);
            using var schema           = Column.CreateSchemaNode(new Column[] { new Column <float>("values") });
            using var writerProperties = CreateWriterProperties();
            using var fileWriter       = new ParquetFileWriter(output, schema, writerProperties);
            using var groupWriter      = fileWriter.AppendRowGroup();

            var exception = Assert.Throws <NotSupportedException>(() => groupWriter.NextColumn().LogicalWriterOverride <VolumeInDollars>());

            StringAssert.StartsWith("unsupported logical system type", exception?.Message);
        }
Example #25
        public long ParquetSharp()
        {
            using (var fileWriter = new ParquetFileWriter("decimal_timeseries.parquet", new Column[] { new Column <decimal>("Value", LogicalType.Decimal(precision: 29, scale: 3)) }))
            {
                using var rowGroupWriter = fileWriter.AppendRowGroup();
                using var valueWriter    = rowGroupWriter.NextColumn().LogicalWriter <decimal>();

                valueWriter.WriteBatch(_values);

                fileWriter.Close();
            }

            return new FileInfo("decimal_timeseries.parquet").Length;
        }
Example #26
        public static void TestDisposeExceptionSafety_RowGroupWriter()
        {
            var exception = Assert.Throws <Exception>(() =>
            {
                using var buffer      = new ResizableBuffer();
                using var outStream   = new BufferOutputStream(buffer);
                using var fileWriter  = new ParquetFileWriter(outStream, new Column[] { new Column <int>("Index"), new Column <float>("Value") });
                using var groupWriter = fileWriter.AppendRowGroup();

                throw new Exception("this is the expected message");
            });

            Assert.That(exception.Message, Contains.Substring("this is the expected message"));
        }
        public static void TestDecimalSeries([Values(0, 1)] int warmup)
        {
            var timer = Stopwatch.StartNew();
            var rand  = new Random(123);

            Console.WriteLine("Generating data...");

            var values = Enumerable.Range(0, 10_000_000).Select(i =>
            {
                var n    = rand.Next();
                var sign = rand.NextDouble() < 0.5 ? -1M : +1M;
                return sign * ((decimal)n * n * n) / 1000M;
            }).ToArray();

            Console.WriteLine("Generated {0:N0} rows in {1:N2} sec", values.Length, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
            Console.WriteLine("Saving to Parquet");

            timer.Restart();

            using (var fileWriter = new ParquetFileWriter("decimal_timeseries.parquet", new Column[] { new Column <decimal>("Value", LogicalType.Decimal(precision: 29, scale: 3)) }))
            {
                using (var rowGroupWriter = fileWriter.AppendRowGroup())
                {
                    using var valueWriter = rowGroupWriter.NextColumn().LogicalWriter <decimal>();
                    valueWriter.WriteBatch(values);
                }

                fileWriter.Close();
            }

            Console.WriteLine("Saved to Parquet ({0:N0} bytes) in {1:N2} sec", new FileInfo("decimal_timeseries.parquet").Length, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
            Console.WriteLine("Saving to Parquet.NET");

            timer.Restart();

            {
                var valueField = new DecimalDataField("Value", precision: 29, scale: 3);
                var schema     = new Parquet.Data.Schema(valueField);

                using var stream        = File.Create("decimal_timeseries.parquet.net");
                using var parquetWriter = new ParquetWriter(schema, stream);
                using var groupWriter   = parquetWriter.CreateRowGroup();

                groupWriter.WriteColumn(new DataColumn(valueField, values));
            }

            Console.WriteLine("Saved to Parquet.NET ({0:N0} bytes) in {1:N2} sec", new FileInfo("decimal_timeseries.parquet.net").Length, timer.Elapsed.TotalSeconds);
        }
Example #28
        public static void TestByteBufferOptimisation()
        {
            const int numStrings = 100_000;

            var strings = Enumerable.Range(0, numStrings).Select(i => i.ToString()).ToArray();

            var cancel = new CancellationTokenSource();
            var task   = Task.Run(() =>
            {
                while (!cancel.IsCancellationRequested)
                {
                    GC.Collect();
                    GC.WaitForPendingFinalizers();
                    Thread.Sleep(1);
                }
            });

            using (var buffer = new ResizableBuffer())
            {
                using (var outStream = new BufferOutputStream(buffer))
                {
                    using var fileWriter = new ParquetFileWriter(outStream, new Column[] { new Column <string>("Name") });

                    using (var groupWriter = fileWriter.AppendRowGroup())
                    {
                        using var columnWriter = groupWriter.NextColumn().LogicalWriter <string>();

                        // Memory pooling for the string-to-byte-array conversion is done by the ByteBuffer class.
                        // If something is fishy there (e.g. bad memory ownership with respect to the GC),
                        // we expect to see the consequences here if we write enough strings.
                        // It's not bulletproof, but it has found a few issues.
                        columnWriter.WriteBatch(strings);
                    }

                    fileWriter.Close();
                }

                using var inStream     = new BufferReader(buffer);
                using var fileReader   = new ParquetFileReader(inStream);
                using var groupReader  = fileReader.RowGroup(0);
                using var columnReader = groupReader.Column(0).LogicalReader <string>();

                Assert.AreEqual(strings, columnReader.ReadAll(numStrings));
            }

            cancel.Cancel();
            task.Wait();
        }
Example #29
        private static void WriteParquetFile(BufferOutputStream output, FileEncryptionProperties fileEncryptionProperties)
        {
            using var writerProperties = CreateWriterProperties(fileEncryptionProperties);
            using var fileWriter       = new ParquetFileWriter(output, Columns, writerProperties);
            using var groupWriter      = fileWriter.AppendRowGroup();

            using (var idWriter = groupWriter.NextColumn().LogicalWriter <int>())
            {
                idWriter.WriteBatch(Ids);
            }

            using (var valueWriter = groupWriter.NextColumn().LogicalWriter <float>())
            {
                valueWriter.WriteBatch(Values);
            }
        }
        public static void TestWriteExplicitSchemaNoTypeFactory()
        {
            // Test that we cannot write values using a custom type without providing a factory.
            using var buffer           = new ResizableBuffer();
            using var output           = new BufferOutputStream(buffer);
            using var schema           = Column.CreateSchemaNode(new Column[] { new Column <float>("values") });
            using var writerProperties = CreateWriterProperties();
            using var fileWriter       = new ParquetFileWriter(output, schema, writerProperties);
            using var groupWriter      = fileWriter.AppendRowGroup();

            var exception = Assert.Throws <InvalidCastException>(() =>
            {
                using var writer = groupWriter.NextColumn().LogicalWriter <VolumeInDollars>();
            });

            StringAssert.StartsWith("Unable to cast object of type 'ParquetSharp.LogicalColumnWriter`3[System.Single,System.Single,System.Single]", exception?.Message);
        }