public static void TestPropertiesBuilder()
{
    // Exercise every setter on WriterPropertiesBuilder and check the values round-trip
    // through Build() into the resulting WriterProperties.
    //
    // Both the builder and the built properties wrap native resources and are IDisposable
    // (see the "using var builder" pattern used elsewhere in this file), so dispose them
    // deterministically instead of leaking them to the finalizer.
    using var builder = new WriterPropertiesBuilder();
    using var p = builder
        .Compression(Compression.Snappy)
        .CompressionLevel(3)
        .CreatedBy("Meeeee!!!")
        .DataPagesize(123)
        .DictionaryPagesizeLimit(456)
        .Encoding(Encoding.DeltaByteArray)
        .MaxRowGroupLength(789)
        .Version(ParquetVersion.PARQUET_1_0)
        .WriteBatchSize(666)
        .Build();

    Assert.AreEqual("Meeeee!!!", p.CreatedBy);
    // Compression and CompressionLevel were set globally, so any column path reports them.
    Assert.AreEqual(Compression.Snappy, p.Compression(new ColumnPath("anypath")));
    Assert.AreEqual(3, p.CompressionLevel(new ColumnPath("anypath")));
    Assert.AreEqual(123, p.DataPageSize);
    // Dictionary index/page encodings are fixed defaults, not affected by Encoding() above.
    Assert.AreEqual(Encoding.PlainDictionary, p.DictionaryIndexEncoding);
    Assert.AreEqual(Encoding.PlainDictionary, p.DictionaryPageEncoding);
    Assert.AreEqual(456, p.DictionaryPagesizeLimit);
    Assert.AreEqual(789, p.MaxRowGroupLength);
    Assert.AreEqual(ParquetVersion.PARQUET_1_0, p.Version);
    Assert.AreEqual(666, p.WriteBatchSize);
}
private static WriterProperties CreateWriterProperties()
{
    // Build writer properties with Snappy compression.
    // Dispose the builder once Build() has run, matching the "using var builder"
    // pattern used by the other CreateWriterProperties overloads in this file
    // (the returned WriterProperties is independent of the builder's lifetime).
    using var builder = new WriterPropertiesBuilder();
    builder.Compression(Compression.Snappy);
    return builder.Build();
}
private static WriterProperties CreateWriterProperties(FileEncryptionProperties fileEncryptionProperties)
{
    // Build writer properties using LZ4 compression together with the supplied
    // file encryption settings; the builder is disposed after Build().
    using var builder = new WriterPropertiesBuilder();
    builder.Compression(Compression.Lz4);
    builder.Encryption(fileEncryptionProperties);
    return builder.Build();
}
public static void TestPropertiesBuilder()
{
    // Exercise the WriterPropertiesBuilder setters and verify the values round-trip
    // through Build() into the resulting WriterProperties.
    //
    // The builder and the built properties wrap native resources and are IDisposable
    // (see the "using var builder" pattern used elsewhere in this file), so dispose
    // them deterministically instead of leaking them to the finalizer.
    using var builder = new WriterPropertiesBuilder();
    builder
        .Compression(Compression.Snappy)
        .CreatedBy("Meeeee!!!")
        .DataPagesize(123)
        .DictionaryPagesizeLimit(456)
        .Encoding(Encoding.DeltaByteArray)
        .MaxRowGroupLength(789)
        .Version(ParquetVersion.PARQUET_1_0)
        .WriteBatchSize(666)
        ;

    using var p = builder.Build();

    Assert.AreEqual("Meeeee!!!", p.CreatedBy);
    Assert.AreEqual(123, p.DataPageSize);
    // Dictionary index/page encodings are fixed defaults, not affected by Encoding() above.
    Assert.AreEqual(Encoding.PlainDictionary, p.DictionaryIndexEncoding);
    Assert.AreEqual(Encoding.PlainDictionary, p.DictionaryPageEncoding);
    Assert.AreEqual(456, p.DictionaryPagesizeLimit);
    Assert.AreEqual(789, p.MaxRowGroupLength);
    Assert.AreEqual(ParquetVersion.PARQUET_1_0, p.Version);
    Assert.AreEqual(666, p.WriteBatchSize);

    // Remaining builder surface, not yet covered by this test:
    /*
     * public WriterPropertiesBuilder DisableDictionary()
     * public WriterPropertiesBuilder DisableDictionary(string path)
     * public WriterPropertiesBuilder EnableDictionary()
     * public WriterPropertiesBuilder EnableDictionary(string path)
     *
     * // Statistics enable/disable
     *
     * public WriterPropertiesBuilder DisableStatistics()
     * public WriterPropertiesBuilder DisableStatistics(string path)
     * public WriterPropertiesBuilder EnableStatistics()
     * public WriterPropertiesBuilder EnableStatistics(string path)
     *
     * // Other properties
     *
     * public WriterPropertiesBuilder Compression(Compression codec)
     * public WriterPropertiesBuilder Compression(string path, Compression codec)
     * public WriterPropertiesBuilder CreatedBy(string createdBy)
     * public WriterPropertiesBuilder DataPagesize(long pageSize)
     * public WriterPropertiesBuilder DictionaryPagesizeLimit(long dictionaryPagesizeLimit)
     * public WriterPropertiesBuilder Encoding(Encoding encoding)
     * public WriterPropertiesBuilder Encoding(string path, Encoding encoding)
     * public WriterPropertiesBuilder MaxRowGroupLength(long maxRowGroupLength)
     * public WriterPropertiesBuilder Version(ParquetVersion version)
     * public WriterPropertiesBuilder WriteBatchSize(long writeBatchSize)
     */
}
public static void TestByteStreamSplitEncoding()
{
    // Round-trip an int column and a float column where the float column is forced
    // to use the BYTE_STREAM_SPLIT encoding (dictionary disabled for it), then verify
    // both the encodings recorded in the column chunk metadata and the values read back.
    const int numRows = 10230;
    var indices = Enumerable.Range(0, numRows).ToArray();
    var floats = indices.Select(i => i / 3.14f).ToArray();

    using var buffer = new ResizableBuffer();

    using (var output = new BufferOutputStream(buffer))
    {
        var schema = new Column[] {new Column<int>("id"), new Column<float>("value")};

        var writerProperties = new WriterPropertiesBuilder()
            .Compression(Compression.Lz4)
            .DisableDictionary("value")
            .Encoding("value", Encoding.ByteStreamSplit)
            .Build();

        using var fileWriter = new ParquetFileWriter(output, schema, writerProperties);
        using var groupWriter = fileWriter.AppendRowGroup();

        using var indexWriter = groupWriter.NextColumn().LogicalWriter<int>();
        indexWriter.WriteBatch(indices);

        using var floatWriter = groupWriter.NextColumn().LogicalWriter<float>();
        floatWriter.WriteBatch(floats);

        fileWriter.Close();
    }

    using var input = new BufferReader(buffer);
    using var fileReader = new ParquetFileReader(input);
    using var groupReader = fileReader.RowGroup(0);

    using var metadataId = groupReader.MetaData.GetColumnChunkMetaData(0);
    using var metadataValue = groupReader.MetaData.GetColumnChunkMetaData(1);

    // The "id" column kept its defaults (dictionary-encoded); "value" must report ByteStreamSplit.
    Assert.AreEqual(new[] {Encoding.PlainDictionary, Encoding.Plain, Encoding.Rle}, metadataId.Encodings);
    Assert.AreEqual(new[] {Encoding.ByteStreamSplit, Encoding.Rle}, metadataValue.Encodings);

    using var indexReader = groupReader.Column(0).LogicalReader<int>();
    using var floatReader = groupReader.Column(1).LogicalReader<float>();

    Assert.AreEqual(indices, indexReader.ReadAll(numRows));
    Assert.AreEqual(floats, floatReader.ReadAll(numRows));
}
private static WriterProperties CreateWriterProperties(ExpectedColumn[] expectedColumns, bool useDictionaryEncoding)
{
    // Build writer properties with LZ4 compression, optionally disabling dictionary
    // encoding per column (dictionary encoding is on by default).
    //
    // Dispose the builder once Build() has run, matching the "using var builder"
    // pattern used by the other CreateWriterProperties overloads in this file.
    using var builder = new WriterPropertiesBuilder();
    builder.Compression(Compression.Lz4);

    if (!useDictionaryEncoding)
    {
        foreach (var column in expectedColumns)
        {
            builder.DisableDictionary(column.Name);
        }
    }

    return builder.Build();
}
private static WriterProperties CreateWriterProperties(bool enableDictionary)
{
    // Snappy-compressed writer properties; dictionary encoding for the "value"
    // column is switched off when not requested (it is on by default).
    using var builder = new WriterPropertiesBuilder();
    builder.Compression(Compression.Snappy);

    if (!enableDictionary)
    {
        builder.DisableDictionary("value");
    }

    return builder.Build();
}
public static void TestBigFileBufferedRowGroup()
{
    // Test a large amount of rows with a buffered row group to uncover any particular issue.
    // Writes numBatches * batchSize rows across four columns of different types, then
    // reads them back batch by batch and checks every value.
    const int numBatches = 64;
    const int batchSize = 8192;

    using var buffer = new ResizableBuffer();

    using (var output = new BufferOutputStream(buffer))
    {
        var columns = new Column[]
        {
            new Column <int>("int"),
            new Column <double>("double"),
            new Column <string>("string"),
            new Column <bool>("bool")
        };

        using var builder = new WriterPropertiesBuilder();
        // Dictionary encoding is disabled for the "double" column only; all other columns keep their defaults.
        using var writerProperties = builder.Compression(Compression.Snappy).DisableDictionary("double").Build();
        using var fileWriter = new ParquetFileWriter(output, columns, writerProperties);
        // Buffered row group: all column writers can be open at once and written in interleaved batches.
        using var rowGroupWriter = fileWriter.AppendBufferedRowGroup();

        using var col0 = rowGroupWriter.Column(0).LogicalWriter <int>();
        using var col1 = rowGroupWriter.Column(1).LogicalWriter <double>();
        using var col2 = rowGroupWriter.Column(2).LogicalWriter <string>();
        using var col3 = rowGroupWriter.Column(3).LogicalWriter <bool>();

        for (var batchIndex = 0; batchIndex < numBatches; ++batchIndex)
        {
            // Values are derived deterministically from the global row index so they
            // can be regenerated independently on the read side.
            var startIndex = batchSize * batchIndex;

            col0.WriteBatch(Enumerable.Range(startIndex, batchSize).ToArray());
            col1.WriteBatch(Enumerable.Range(startIndex, batchSize).Select(i => (double) i).ToArray());
            col2.WriteBatch(Enumerable.Range(startIndex, batchSize).Select(i => i.ToString()).ToArray());
            col3.WriteBatch(Enumerable.Range(startIndex, batchSize).Select(i => i % 2 == 0).ToArray());
        }

        fileWriter.Close();
    }

    using (var input = new BufferReader(buffer))
    {
        using var fileReader = new ParquetFileReader(input);
        using var rowGroupReader = fileReader.RowGroup(0);

        using var col0 = rowGroupReader.Column(0).LogicalReader <int>();
        using var col1 = rowGroupReader.Column(1).LogicalReader <double>();
        using var col2 = rowGroupReader.Column(2).LogicalReader <string>();
        using var col3 = rowGroupReader.Column(3).LogicalReader <bool>();

        for (var batchIndex = 0; batchIndex < numBatches; ++batchIndex)
        {
            var startIndex = batchSize * batchIndex;

            // Regenerate the expected values for this batch and compare with what was read.
            Assert.AreEqual(Enumerable.Range(startIndex, batchSize).ToArray(), col0.ReadAll(batchSize));
            Assert.AreEqual(Enumerable.Range(startIndex, batchSize).Select(i => (double) i).ToArray(), col1.ReadAll(batchSize));
            Assert.AreEqual(Enumerable.Range(startIndex, batchSize).Select(i => i.ToString()).ToArray(), col2.ReadAll(batchSize));
            Assert.AreEqual(Enumerable.Range(startIndex, batchSize).Select(i => i % 2 == 0).ToArray(), col3.ReadAll(batchSize));
        }

        fileReader.Close();
    }
}
public static void TestProperties()
{
    // Test the various properties exposed by ParquetFileWriter.
    // Checks the writer's metadata accessors at each stage of its lifecycle:
    // after construction, during a row group, after the row group, and after Close().
    using var writerPropertiesBuilder = new WriterPropertiesBuilder();
    using var writerProperties = writerPropertiesBuilder
        .Compression(Compression.Zstd)
        .DisableDictionary()
        .CreatedBy("Some crazy unit test")
        .Build();

    var columns = new Column[] {new Column <int>("Index"), new Column <float>("Value")};

    // Key-value metadata includes non-ASCII characters to check it survives round-tripping.
    var kvm = (IReadOnlyDictionary <string, string>) new Dictionary <string, string>
    {
        {"some key", "some value"},
        {"α ∧ ¬β", "2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm"}
    };

    using var buffer = new ResizableBuffer();
    using var outStream = new BufferOutputStream(buffer);
    using var fileWriter = new ParquetFileWriter(outStream, columns, writerProperties, kvm);

    // Freshly constructed: schema and properties are visible, but no rows or row groups yet,
    // and FileMetaData is only populated once the file has been closed.
    Assert.AreEqual(2, fileWriter.NumColumns);
    Assert.AreEqual(0, fileWriter.NumRows);
    Assert.AreEqual(0, fileWriter.NumRowGroups);
    Assert.IsNull(fileWriter.FileMetaData);
    Assert.AreEqual(Column.CreateSchemaNode(columns), fileWriter.Schema.GroupNode);
    Assert.AreEqual(Column.CreateSchemaNode(columns), fileWriter.Schema.SchemaRoot);
    Assert.AreEqual(columns[0].Name, fileWriter.ColumnDescriptor(0).Name);
    Assert.AreEqual(columns[1].Name, fileWriter.ColumnDescriptor(1).Name);
    Assert.AreEqual(kvm, fileWriter.KeyValueMetadata);
    Assert.AreEqual(Compression.Zstd, fileWriter.WriterProperties.Compression(new ColumnPath("")));
    Assert.AreEqual(false, fileWriter.WriterProperties.DictionaryEnabled(new ColumnPath("")));
    Assert.AreEqual("Some crazy unit test", fileWriter.WriterProperties.CreatedBy);

    using (var groupWriter = fileWriter.AppendRowGroup())
    {
        // NumRowGroups reflects the appended group immediately, but NumRows stays 0
        // until the file is closed, and FileMetaData remains null.
        Assert.AreEqual(0, fileWriter.NumRows);
        Assert.AreEqual(1, fileWriter.NumRowGroups);
        Assert.IsNull(fileWriter.FileMetaData);

        using (var writer = groupWriter.NextColumn().LogicalWriter <int>())
        {
            writer.WriteBatch(new[] {1, 2, 3, 4, 5, 6});
        }

        // Writing a column batch still does not change the writer-level counts.
        Assert.AreEqual(0, fileWriter.NumRows);
        Assert.AreEqual(1, fileWriter.NumRowGroups);
        Assert.IsNull(fileWriter.FileMetaData);

        using (var writer = groupWriter.NextColumn().LogicalWriter <float>())
        {
            writer.WriteBatch(new[] {1f, 2f, 3f, 4f, 5f, 6f});
        }

        Assert.AreEqual(0, fileWriter.NumRows);
        Assert.AreEqual(1, fileWriter.NumRowGroups);
        Assert.IsNull(fileWriter.FileMetaData);
    }

    // Row group disposed but file not closed yet: still no rows and no file metadata.
    Assert.AreEqual(0, fileWriter.NumRows);
    Assert.AreEqual(1, fileWriter.NumRowGroups);
    Assert.IsNull(fileWriter.FileMetaData);

    fileWriter.Close();

    //Assert.AreEqual(0, fileWriter.NumRows); // 2021-04-08: calling this results in a segfault when the writer has been closed
    //Assert.AreEqual(1, fileWriter.NumRowGroups); // 2021-04-08: calling this results in a segfault when the writer has been closed

    // After Close(), FileMetaData becomes available and reflects the written content.
    Assert.IsNotNull(fileWriter.FileMetaData);
    Assert.AreEqual(2, fileWriter.FileMetaData?.NumColumns);
    Assert.AreEqual(6, fileWriter.FileMetaData?.NumRows);
    Assert.AreEqual(1, fileWriter.FileMetaData?.NumRowGroups);
    Assert.AreEqual(kvm, fileWriter.FileMetaData?.KeyValueMetadata);
}