public void Writing_another_chunk_validates_schema()
{
   var ds1 = new DataSet(new SchemaElement<int>("id"));
   var ds2 = new DataSet(new SchemaElement<int>("id1"));

   using (var ms = new MemoryStream())
   {
      using (var ps = new ParquetWriter(ms))
      {
         ps.Write(ds1);
         Assert.Throws<ParquetException>(() => ps.Write(ds2));
      }
   }
}
public void Write_and_read_nullable_integers()
{
   var ds = new DataSet(new DataField<int?>("id"))
   {
      1, 2, 3,
      (object)null,
      4,
      (object)null,
      5
   };
   var ms = new MemoryStream();
   ParquetWriter.Write(ds, ms);

   ms.Position = 0;
   DataSet ds1 = ParquetReader.Read(ms);

   Assert.Equal(1, ds1[0].GetInt(0));
   Assert.Equal(2, ds1[1].GetInt(0));
   Assert.Equal(3, ds1[2].GetInt(0));
   Assert.True(ds1[3].IsNullAt(0));
   Assert.Equal(4, ds1[4].GetInt(0));
   Assert.True(ds1[5].IsNullAt(0));
   Assert.Equal(5, ds1[6].GetInt(0));
}
public void Type_writes_and_reads_end_to_end(SchemaElement schema, object value, string name = null)
{
   var ds = new DataSet(schema) { new Row(value) };
   var ms = new MemoryStream();
   ParquetWriter.Write(ds, ms);

   ms.Position = 0;
   DataSet ds1 = ParquetReader.Read(ms);

   object expectedValue = ds[0][0];
   object actualValue = ds1[0][0];
   if (schema.ElementType == typeof(DateTime))
   {
      actualValue = ((DateTimeOffset)actualValue).DateTime;
   }

   Assert.True(expectedValue.Equals(actualValue),
      $"{name}| expected: {expectedValue}, actual: {actualValue}, schema element: {schema}");

   //if (schema.ElementType == typeof(decimal)) ParquetWriter.WriteFile(ds1, "c:\\tmp\\decimals.parquet");
}
public void Write_and_read_nullable_integers()
{
   var ds = new DataSet(new SchemaElement<int>("id"))
   {
      1, 2, 3,
      (object)null,
      4,
      (object)null,
      5
   };
   var ms = new MemoryStream();
   ParquetWriter.Write(ds, ms);

   ms.Position = 0;
   DataSet ds1 = ParquetReader.Read(ms);

   Assert.Equal(1, ds1[0].GetInt(0));
   Assert.Equal(2, ds1[1].GetInt(0));
   Assert.Equal(3, ds1[2].GetInt(0));
   Assert.True(ds1[3].IsNullAt(0));
   Assert.Equal(4, ds1[4].GetInt(0));
   Assert.True(ds1[5].IsNullAt(0));
   Assert.Equal(5, ds1[6].GetInt(0));
}
public void All_compression_methods_supported(CompressionMethod compressionMethod)
{
   //v2
   var ms = new MemoryStream();
   DataSet ds1 = new DataSet(new DataField<int>("id"));
   DataSet ds2;
   ds1.Add(5);

   //write using the compression method under test
   using (var writer = new ParquetWriter(ms))
   {
      writer.Write(ds1, compressionMethod);
   }

   //read back
   using (var reader = new ParquetReader(ms))
   {
      ms.Position = 0;
      ds2 = reader.Read();
   }

   Assert.Equal(5, ds2[0].GetInt(0));

   //v3
   const int value = 5;
   object actual = WriteReadSingle(new DataField<int>("id"), value, compressionMethod);
   Assert.Equal(5, (int)actual);
}
public void I_can_write_snappy_and_read_back()
{
   var ms = new MemoryStream();
   var ds1 = new DataSet(
      new DataField<int>("id"),
      new DataField<int>("no"));
   ds1.Add(1, 3);
   ds1.Add(2, 4);
   DataSet ds2;

   //write
   using (var writer = new ParquetWriter(ms))
   {
      writer.Write(ds1, CompressionMethod.Snappy);
   }

   //read back
   using (var reader = new ParquetReader(ms))
   {
      ms.Position = 0;
      ds2 = reader.Read();
   }

   Assert.Equal(1, ds2[0].GetInt(0));
   Assert.Equal(2, ds2[1].GetInt(0));
   Assert.Equal(3, ds2[0].GetInt(1));
   Assert.Equal(4, ds2[1].GetInt(1));
}
public void Append_to_file_works_for_all_data_types()
{
   var ms = new MemoryStream();

   var schema = new Schema();
   schema.Elements.Add(new SchemaElement<int>("Id"));
   schema.Elements.Add(new SchemaElement<DateTime>("Timestamp"));
   schema.Elements.Add(new SchemaElement<DateTimeOffset>("Timestamp2"));
   schema.Elements.Add(new SchemaElement<string>("Message"));
   schema.Elements.Add(new SchemaElement<byte[]>("Data"));
   schema.Elements.Add(new SchemaElement<bool>("IsDeleted"));
   schema.Elements.Add(new SchemaElement<float>("Amount"));
   schema.Elements.Add(new SchemaElement<decimal>("TotalAmount"));
   schema.Elements.Add(new SchemaElement<long>("Counter"));
   schema.Elements.Add(new SchemaElement<double>("Amount2"));
   schema.Elements.Add(new SchemaElement<byte>("Flag"));
   schema.Elements.Add(new SchemaElement<sbyte>("Flag2"));
   schema.Elements.Add(new SchemaElement<short>("Flag3"));
   schema.Elements.Add(new SchemaElement<ushort>("Flag4"));

   var ds1 = new DataSet(schema);
   ds1.Add(1, DateTime.Now, DateTimeOffset.Now, "Record1", System.Text.Encoding.ASCII.GetBytes("SomeData"), false, 123.4f, 200M, 100000L, 1331313D, (byte)1, (sbyte)-1, (short)-500, (ushort)500);
   ds1.Add(1, DateTime.Now, DateTimeOffset.Now, "Record2", System.Text.Encoding.ASCII.GetBytes("SomeData2"), false, 124.4f, 300M, 200000L, 2331313D, (byte)2, (sbyte)-2, (short)-400, (ushort)400);
   ParquetWriter.Write(ds1, ms, CompressionMethod.Snappy, null, null, false);

   var ds2 = new DataSet(schema);
   ds2.Add(1, DateTime.Now, DateTimeOffset.Now, "Record3", System.Text.Encoding.ASCII.GetBytes("SomeData3"), false, 125.4f, 400M, 300000L, 3331313D, (byte)3, (sbyte)-3, (short)-600, (ushort)600);
   ds2.Add(1, DateTime.Now, DateTimeOffset.Now, "Record4", System.Text.Encoding.ASCII.GetBytes("SomeData4"), false, 126.4f, 500M, 400000L, 4331313D, (byte)4, (sbyte)-4, (short)-700, (ushort)700);
   ParquetWriter.Write(ds2, ms, CompressionMethod.Snappy, null, null, true);
}
public void Flat_write_read()
{
   var table = new Table(new Schema(new DataField<int>("id"), new DataField<string>("city")));
   var ms = new MemoryStream();

   //generate fake data
   for (int i = 0; i < 1000; i++)
   {
      table.Add(new Row(i, "record#" + i));
   }

   //write to stream
   using (var writer = new ParquetWriter(table.Schema, ms))
   {
      writer.Write(table);
   }

   //read back into table
   ms.Position = 0;
   Table table2;
   using (var reader = new ParquetReader(ms))
   {
      table2 = reader.ReadAsTable();
   }

   //validate data
   Assert.True(table.Equals(table2, true));
}
public void Special_read_file_with_multiple_row_groups()
{
   var ms = new MemoryStream();

   //create a multi-row-group file

   //first row group
   var t = new Table(new DataField<int>("id"));
   t.Add(1);
   t.Add(2);
   using (var writer = new ParquetWriter(t.Schema, ms))
   {
      writer.Write(t);
   }

   //second row group
   t.Clear();
   t.Add(3);
   t.Add(4);
   using (var writer = new ParquetWriter(t.Schema, ms, null, true))
   {
      writer.Write(t);
   }

   //read back as table
   t = ParquetReader.ReadTableFromStream(ms);
   Assert.Equal(4, t.Count);
}
public static void CreateParquetFile(Stream inStream, Stream outStream)
{
   using (var writer = new ParquetWriter(outStream))
   {
      DataSet ds = null;
      int recordCount = 0;

      foreach (var data in ReadFile(inStream))
      {
         //build the schema from the first record's properties
         if (recordCount == 0)
         {
            var fields = new List<Parquet.Data.Field>();
            foreach (var prop in data.Properties)
            {
               fields.Add(new DataField(prop.Key, prop.Value.GetType()));
            }
            foreach (var prop in data.SystemProperties)
            {
               fields.Add(new DataField(prop.Key, prop.Value.GetType()));
            }
            fields.Add(new DataField<byte[]>("Body"));
            ds = new DataSet(fields.ToArray());
         }

         //append one row per record: user properties, system properties, then the body
         var values = new List<object>();
         values.AddRange(data.Properties.Values);
         values.AddRange(data.SystemProperties.Values);
         values.Add(data.Body.ToArray());
         ds.Add(values.ToArray());
         recordCount++;
      }

      writer.Write(ds);
   }
}
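//A hypothetical caller for the CreateParquetFile helper above, shown only as a usage sketch;
//the method name and file names are placeholders, not part of the original code.
public static void ConvertMessagesToParquet()
{
   using (Stream input = System.IO.File.OpenRead("messages.dat"))
   using (Stream output = System.IO.File.Create("messages.parquet"))
   {
      CreateParquetFile(input, output);
   }
}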
public void Write_datetimeoffset()
{
   var ds = new DataSet(
      new SchemaElement<DateTimeOffset>("timestamp_col")
   )
   {
      new DateTimeOffset(new DateTime(2017, 1, 1, 12, 13, 22)),
      new DateTimeOffset(new DateTime(2017, 1, 1, 12, 13, 23))
   };

   var uncompressed = new MemoryStream();
   using (var writer = new ParquetWriter(uncompressed))
   {
      writer.Write(ds, CompressionMethod.None);
   }

#if DEBUG
   const string path = "c:\\tmp\\first.parquet";
   F.WriteAllBytes(path, uncompressed.ToArray());
#endif
}
public void Array_write_read()
{
   var table = new Table(
      new Schema(
         new DataField<int>("id"),
         new DataField<string[]>("categories")   //array field
      )
   );
   var ms = new MemoryStream();

   table.Add(1, new[] { "1", "2", "3" });
   table.Add(3, new[] { "3", "3", "3" });

   //write to stream
   using (var writer = new ParquetWriter(table.Schema, ms))
   {
      writer.Write(table);
   }

   //System.IO.File.WriteAllBytes("c:\\tmp\\1.parquet", ms.ToArray());

   //read back into table
   ms.Position = 0;
   Table table2;
   using (var reader = new ParquetReader(ms))
   {
      table2 = reader.ReadAsTable();
   }

   //validate data
   Assert.Equal(table.ToString(), table2.ToString(), ignoreLineEndingDifferences: true);
}
public void All_compression_methods_supported(CompressionMethod compressionMethod)
{
   //v2
   var ms = new MemoryStream();
   DataSet ds1 = new DataSet(new DataField<int>("id"));
   DataSet ds2;
   ds1.Add(5);

   //write
   using (var writer = new ParquetWriter(ms))
   {
      writer.Write(ds1, compressionMethod);
   }

   //read back
   using (var reader = new ParquetReader(ms))
   {
      ms.Position = 0;
      ds2 = reader.Read();
   }

   Assert.Equal(5, ds2[0].GetInt(0));

   //v3
   //looks like writing is not working in certain scenarios!
   //broken length: 177
   //correct length: 187
   const int value = 5;
   object actual = WriteReadSingle(new DataField<int>("id"), value, compressionMethod);
   Assert.Equal(5, (int)actual);
}
public void Write_int64datetimeoffset()
{
   var element = new SchemaElement<DateTimeOffset>("timestamp_col");
   /*{
    * ThriftConvertedType = ConvertedType.TIMESTAMP_MILLIS,
    * ThriftOriginalType = Type.INT64
    * };*/

   var ds = new DataSet(element)
   {
      new DateTimeOffset(new DateTime(2017, 1, 1, 12, 13, 22)),
      new DateTimeOffset(new DateTime(2017, 1, 1, 12, 13, 24))
   };

   var uncompressed = new MemoryStream();
   using (var writer = new ParquetWriter(uncompressed))
   {
      writer.Write(ds, CompressionMethod.None);
   }
}
public void List_simple_element_write_read()
{
   var table = new Table(
      new Schema(
         new DataField<int>("id"),
         new ListField("cities", new DataField<string>("name"))));

   var ms = new MemoryStream();

   table.Add(1, new[] { "London", "Derby" });
   table.Add(2, new[] { "Paris", "New York" });

   //write as table
   using (var writer = new ParquetWriter(table.Schema, ms))
   {
      writer.Write(table);
   }

   //read back into table
   ms.Position = 0;
   Table table2;
   using (var reader = new ParquetReader(ms))
   {
      table2 = reader.ReadAsTable();
   }

   //validate data
   Assert.Equal(table.ToString(), table2.ToString(), ignoreLineEndingDifferences: true);
}
public void Write_different_compressions()
{
   var ds = new DataSet(
      new SchemaElement<int>("id"),
      new SchemaElement<bool>("bool_col"),
      new SchemaElement<string>("string_col")
   )
   {
      //8 values for each column
      { 4, true, "0" },
      { 5, false, "1" },
      { 6, true, "0" },
      { 7, false, "1" },
      { 2, true, "0" },
      { 3, false, "1" },
      { 0, true, "0" },
      { 1, false, "0" }
   };

   var uncompressed = new MemoryStream();
   ParquetWriter.Write(ds, uncompressed, CompressionMethod.None);

   var compressed = new MemoryStream();
   ParquetWriter.Write(ds, compressed, CompressionMethod.Gzip);

   var compressedSnappy = new MemoryStream();
   ParquetWriter.Write(ds, compressedSnappy, CompressionMethod.Snappy);
}
private void CompareWithMr(Table t)
{
   string testFileName = Path.GetFullPath("temp.parquet");

   if (F.Exists(testFileName))
   {
      F.Delete(testFileName);
   }

   //produce file
   using (Stream s = F.OpenWrite(testFileName))
   {
      using (var writer = new ParquetWriter(t.Schema, s))
      {
         writer.Write(t);
      }
   }

   //read back
   Table t2 = ParquetReader.ReadTableFromFile(testFileName);

   //check we don't have a bug internally before launching MR
   Assert.Equal(t.ToString("j"), t2.ToString("j"), ignoreLineEndingDifferences: true);

   string mrJson = ExecAndGetOutput(_javaExecName, $"-jar {_toolsJarPath} cat -j {testFileName}");
   Assert.Equal(t.ToString("j"), mrJson);
}
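//A minimal sketch of how the CompareWithMr helper above could be driven from a test,
//assuming the Table/Schema/DataField API used elsewhere in this section; the test name
//and column layout are hypothetical.
public void Integer_table_matches_parquet_mr_output()
{
   var t = new Table(new Schema(new DataField<int>("id")));
   t.Add(1);
   t.Add(2);

   CompareWithMr(t);
}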
public override void Close()
{
   _writer.Write(_ds);
   _writer.Dispose();

   //_tempStream.Position = 0;
   //_tempStream.CopyTo(_resultStream);
}
public static DataSet WriteRead(DataSet original, WriterOptions writerOptions = null)
{
   var ms = new MemoryStream();

   ParquetWriter.Write(original, ms, CompressionMethod.None, null, writerOptions);

   ms.Position = 0;
   return ParquetReader.Read(ms);
}
public static DataSet WriteReadOpt(DataSet original, WriterOptions writerOptions = null)
{
   var ms = new MemoryStream();

   ParquetWriter.Write(original, ms, CompressionMethod.None, null, writerOptions);
   ms.Flush();
   //System.IO.File.WriteAllBytes("c:\\tmp\\wr.parquet", ms.ToArray());

   ms.Position = 0;
   return ParquetReader.Read(ms);
}
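//A short usage sketch for the WriteRead/WriteReadOpt helpers above, assuming the
//DataSet/SchemaElement API shown in the other snippets; the test name and column
//name are placeholders.
public void Roundtrip_via_write_read_helper()
{
   var original = new DataSet(new SchemaElement<int>("id"));
   original.Add(1);

   DataSet roundTripped = WriteRead(original);

   Assert.Equal(1, roundTripped[0].GetInt(0));
}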
private void FlushDataSet()
{
   if (_ds == null)
   {
      return;
   }

   _writer.Write(_ds);
   _ds = null;
}
public void Reads_created_by_metadata()
{
   DataSet ds = DataSetGenerator.Generate(10);

   var ms = new MemoryStream();
   ParquetWriter.Write(ds, ms);

   ms.Position = 0;
   DataSet ds1 = ParquetReader.Read(ms);
   Assert.StartsWith("parquet-dotnet", ds1.Metadata.CreatedBy);
}
public void Write_in_small_chunks_to_forward_only_stream()
{
   var ms = new MemoryStream();
   var forwardOnly = new WriteableNonSeekableStream(ms);

   var ds = new DataSet(
      new SchemaElement<int>("id"),
      new SchemaElement<string>("nonsense"));
   ds.Add(1, Generator.RandomString);

   using (var writer = new ParquetWriter(forwardOnly))
   {
      writer.Write(ds);
      writer.Write(ds);
      writer.Write(ds);
   }

   ms.Position = 0;
   DataSet ds1 = ParquetReader.Read(ms);

   Assert.Equal(3, ds1.RowCount);
}
public void Floats()
{
   var ds = new DataSet(new SchemaElement<float>("f"));
   ds.Add((float)1.23);

   var ms = new MemoryStream();
   ParquetWriter.Write(ds, ms);

   ms.Position = 0;
   DataSet ds1 = ParquetReader.Read(ms);

   Assert.Equal(ds[0].GetFloat(0), ds1[0].GetFloat(0));
}
public void Doubles()
{
   var ds = new DataSet(new SchemaElement<double>("d"));
   ds.Add((double)12.34);

   var ms = new MemoryStream();
   ParquetWriter.Write(ds, ms);

   ms.Position = 0;
   DataSet ds1 = ParquetReader.Read(ms);

   Assert.Equal(ds[0].GetDouble(0), ds1[0].GetDouble(0));
}
public void Read_from_negative_offset_fails()
{
   DataSet ds = DataSetGenerator.Generate(15);

   var wo = new WriterOptions { RowGroupsSize = 5 };
   var ro = new ReaderOptions { Offset = -4, Count = 2 };

   var ms = new MemoryStream();
   ParquetWriter.Write(ds, ms, CompressionMethod.None, null, wo);

   ms.Position = 0;
   Assert.Throws<ParquetException>(() => ParquetReader.Read(ms, null, ro));
}
public void Append_to_file_with_different_schema_fails()
{
   var ms = new MemoryStream();

   var ds1 = new DataSet(new SchemaElement<int>("id"));
   ds1.Add(1);
   ds1.Add(2);
   ParquetWriter.Write(ds1, ms);

   //append to file
   var ds2 = new DataSet(new SchemaElement<double>("id"));
   ds2.Add(3d);
   ds2.Add(4d);
   Assert.Throws<ParquetException>(() => ParquetWriter.Write(ds2, ms, CompressionMethod.Gzip, null, null, true));
}
public void Write_different_compressions()
{
   var ds = new DataSet(
      new SchemaElement<int>("id"),
      new SchemaElement<bool>("bool_col"),
      new SchemaElement<string>("string_col")
   );

   //8 values for each column
   ds.Add(4, true, "0");
   ds.Add(5, false, "1");
   ds.Add(6, true, "0");
   ds.Add(7, false, "1");
   ds.Add(2, true, "0");
   ds.Add(3, false, "1");
   ds.Add(0, true, "0");
   ds.Add(1, false, "0");

   var uncompressed = new MemoryStream();
   using (var writer = new ParquetWriter(uncompressed))
   {
      writer.Write(ds, CompressionMethod.None);
   }

   var compressed = new MemoryStream();
   using (var writer = new ParquetWriter(compressed))
   {
      writer.Write(ds, CompressionMethod.Gzip);
   }

   var compressedSnappy = new MemoryStream();
   using (var writer = new ParquetWriter(compressedSnappy))
   {
      writer.Write(ds, CompressionMethod.Snappy);
   }

#if DEBUG
   const string path = "c:\\tmp\\first.parquet";
   F.WriteAllBytes(path, uncompressed.ToArray());
#endif
}
public void Type_write_byte_and_short_byte()
{
   var schema = new Schema(
      new SchemaElement<sbyte>("sbyte"),
      new SchemaElement<byte>("byte"));

   var ds = new DataSet(schema)
   {
      { (sbyte)121, (byte)122 }
   };

   var ms = new MemoryStream();
   ParquetWriter.Write(ds, ms);

   ms.Position = 0;
   DataSet ds1 = ParquetReader.Read(ms);

   Assert.Equal(121, (sbyte)ds1[0][0]);
   Assert.Equal(122, (byte)ds1[0][1]);
}
/// <summary>
///   Writes the file
/// </summary>
protected override void SaveFile(LibLogger.Models.Log.BlockLogModel block, string fileName)
{
   // Write the file
   using (CsvReader reader = new CsvReader(FileName, FileParameters, FileColumns))
   {
      using (ParquetWriter writer = new ParquetWriter(fileName))
      {
         // Log progress
         writer.Progress += (sender, args) => block.Progress(System.IO.Path.GetFileName(fileName), args.Records, args.Records + 1);
         // Write the file
         writer.Write(reader);
      }
   }
   // Log
   block.Progress(System.IO.Path.GetFileName(fileName), 0, 0);
   block.Info($"Finished writing file '{fileName}'");
   SolutionViewModel.MainController.Logger.Flush();
}