Example #1
        public void Writing_another_chunk_validates_schema()
        {
            var ds1 = new DataSet(new SchemaElement<int>("id"));
            var ds2 = new DataSet(new SchemaElement<int>("id1"));

            using (var ms = new MemoryStream())
            {
                using (var ps = new ParquetWriter(ms))
                {
                    ps.Write(ds1);

                    Assert.Throws<ParquetException>(() => ps.Write(ds2));
                }
            }
        }
        public void Write_and_read_nullable_integers()
        {
            var ds = new DataSet(new DataField<int?>("id"))
            {
                1,
                2,
                3,
                (object)null,
                4,
                (object)null,
                5
            };
            var ms = new MemoryStream();

            ParquetWriter.Write(ds, ms);

            ms.Position = 0;
            DataSet ds1 = ParquetReader.Read(ms);

            Assert.Equal(1, ds1[0].GetInt(0));
            Assert.Equal(2, ds1[1].GetInt(0));
            Assert.Equal(3, ds1[2].GetInt(0));
            Assert.True(ds1[3].IsNullAt(0));
            Assert.Equal(4, ds1[4].GetInt(0));
            Assert.True(ds1[5].IsNullAt(0));
            Assert.Equal(5, ds1[6].GetInt(0));
        }
        public void Type_writes_and_reads_end_to_end(SchemaElement schema, object value, string name = null)
        {
            var ds = new DataSet(schema)
            {
                new Row(value)
            };
            var ms = new MemoryStream();

            ParquetWriter.Write(ds, ms);

            ms.Position = 0;
            DataSet ds1 = ParquetReader.Read(ms);

            object expectedValue = ds[0][0];
            object actualValue   = ds1[0][0];

            if (schema.ElementType == typeof(DateTime))
            {
                actualValue = ((DateTimeOffset)actualValue).DateTime;
            }

            Assert.True(expectedValue.Equals(actualValue),
                        $"{name}| expected: {expectedValue}, actual: {actualValue}, schema element: {schema}");

            //if (schema.ElementType == typeof(decimal)) ParquetWriter.WriteFile(ds1, "c:\\tmp\\decimals.parquet");
        }
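Since Type_writes_and_reads_end_to_end takes parameters, it is meant to run as a parameterized test. A minimal, hypothetical sketch of driving it through xUnit's [Theory] and [MemberData] follows; the TypeValues property and the schema/value pairs are illustrative assumptions, not part of the original suite:

        // Hypothetical data source for the theory; the pairs below are examples only.
        public static IEnumerable<object[]> TypeValues => new[]
        {
            new object[] { new SchemaElement<int>("num"), 42, "int" },
            new object[] { new SchemaElement<string>("str"), "hello", "string" }
        };

        [Theory]
        [MemberData(nameof(TypeValues))]
        public void Type_round_trips(SchemaElement schema, object value, string name)
        {
            Type_writes_and_reads_end_to_end(schema, value, name);
        }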
Example #4
        public void Write_and_read_nullable_integers()
        {
            var ds = new DataSet(new SchemaElement<int?>("id"))
            {
                1,
                2,
                3,
                (object)null,
                4,
                (object)null,
                5
            };
            var ms = new MemoryStream();

            ParquetWriter.Write(ds, ms);

            ms.Position = 0;
            DataSet ds1 = ParquetReader.Read(ms);

            Assert.Equal(1, ds1[0].GetInt(0));
            Assert.Equal(2, ds1[1].GetInt(0));
            Assert.Equal(3, ds1[2].GetInt(0));
            Assert.True(ds1[3].IsNullAt(0));
            Assert.Equal(4, ds1[4].GetInt(0));
            Assert.True(ds1[5].IsNullAt(0));
            Assert.Equal(5, ds1[6].GetInt(0));
        }
        public void All_compression_methods_supported(CompressionMethod compressionMethod)
        {
            //v2
            var     ms  = new MemoryStream();
            DataSet ds1 = new DataSet(new DataField<int>("id"));
            DataSet ds2;

            ds1.Add(5);

            //write
            using (var writer = new ParquetWriter(ms))
            {
                writer.Write(ds1, compressionMethod);
            }

            //read back
            using (var reader = new ParquetReader(ms))
            {
                ms.Position = 0;
                ds2         = reader.Read();
            }

            Assert.Equal(5, ds2[0].GetInt(0));

            //v3
            const int value  = 5;
            object    actual = WriteReadSingle(new DataField<int>("id"), value, compressionMethod);

            Assert.Equal(5, (int)actual);
        }
Example #6
      public void I_can_write_snappy_and_read_back()
      {
         var ms = new MemoryStream();
         var ds1 = new DataSet(
            new DataField<int>("id"),
            new DataField<int>("no"));

         ds1.Add(1, 3);
         ds1.Add(2, 4);

         DataSet ds2;

         //write
         using (var writer = new ParquetWriter(ms))
         {
            writer.Write(ds1, CompressionMethod.Snappy);
         }

         //read back
         using (var reader = new ParquetReader(ms))
         {
            ms.Position = 0;
            ds2 = reader.Read();
         }

         Assert.Equal(1, ds2[0].GetInt(0));
         Assert.Equal(2, ds2[1].GetInt(0));
         Assert.Equal(3, ds2[0].GetInt(1));
         Assert.Equal(4, ds2[1].GetInt(1));
      }
Example #7
        public void Append_to_file_works_for_all_data_types()
        {
            var ms = new MemoryStream();

            var schema = new Schema();

            schema.Elements.Add(new SchemaElement<int>("Id"));
            schema.Elements.Add(new SchemaElement<DateTime>("Timestamp"));
            schema.Elements.Add(new SchemaElement<DateTimeOffset>("Timestamp2"));
            schema.Elements.Add(new SchemaElement<string>("Message"));
            schema.Elements.Add(new SchemaElement<byte[]>("Data"));
            schema.Elements.Add(new SchemaElement<bool>("IsDeleted"));
            schema.Elements.Add(new SchemaElement<float>("Amount"));
            schema.Elements.Add(new SchemaElement<decimal>("TotalAmount"));
            schema.Elements.Add(new SchemaElement<long>("Counter"));
            schema.Elements.Add(new SchemaElement<double>("Amount2"));
            schema.Elements.Add(new SchemaElement<byte>("Flag"));
            schema.Elements.Add(new SchemaElement<sbyte>("Flag2"));
            schema.Elements.Add(new SchemaElement<short>("Flag3"));
            schema.Elements.Add(new SchemaElement<ushort>("Flag4"));

            var ds1 = new DataSet(schema);

            ds1.Add(1, DateTime.Now, DateTimeOffset.Now, "Record1", System.Text.Encoding.ASCII.GetBytes("SomeData"), false, 123.4f, 200M, 100000L, 1331313D, (byte)1, (sbyte)-1, (short)-500, (ushort)500);
            ds1.Add(1, DateTime.Now, DateTimeOffset.Now, "Record2", System.Text.Encoding.ASCII.GetBytes("SomeData2"), false, 124.4f, 300M, 200000L, 2331313D, (byte)2, (sbyte)-2, (short)-400, (ushort)400);

            ParquetWriter.Write(ds1, ms, CompressionMethod.Snappy, null, null, false);

            var ds2 = new DataSet(schema);

            ds2.Add(1, DateTime.Now, DateTimeOffset.Now, "Record3", System.Text.Encoding.ASCII.GetBytes("SomeData3"), false, 125.4f, 400M, 300000L, 3331313D, (byte)3, (sbyte)-3, (short)-600, (ushort)600);
            ds2.Add(1, DateTime.Now, DateTimeOffset.Now, "Record4", System.Text.Encoding.ASCII.GetBytes("SomeData4"), false, 126.4f, 500M, 400000L, 4331313D, (byte)4, (sbyte)-4, (short)-700, (ushort)700);

            ParquetWriter.Write(ds2, ms, CompressionMethod.Snappy, null, null, true);
        }
        public void Flat_write_read()
        {
            var table = new Table(new Schema(new DataField<int>("id"), new DataField<string>("city")));
            var ms    = new MemoryStream();

            //generate fake data
            for (int i = 0; i < 1000; i++)
            {
                table.Add(new Row(i, "record#" + i));
            }

            //write to stream
            using (var writer = new ParquetWriter(table.Schema, ms))
            {
                writer.Write(table);
            }

            //read back into table
            ms.Position = 0;
            Table table2;

            using (var reader = new ParquetReader(ms))
            {
                table2 = reader.ReadAsTable();
            }

            //validate data
            Assert.True(table.Equals(table2, true));
        }
        public void Special_read_file_with_multiple_row_groups()
        {
            var ms = new MemoryStream();

            //create multirowgroup file

            //first row group
            var t = new Table(new DataField<int>("id"));

            t.Add(1);
            t.Add(2);
            using (var writer = new ParquetWriter(t.Schema, ms))
            {
                writer.Write(t);
            }

            //second row group
            t.Clear();
            t.Add(3);
            t.Add(4);
            using (var writer = new ParquetWriter(t.Schema, ms, null, true))
            {
                writer.Write(t);
            }

            //read back as table
            t = ParquetReader.ReadTableFromStream(ms);
            Assert.Equal(4, t.Count);
        }
Example #10
 public static void CreateParquetFile(Stream inStream, Stream outStream)
 {
     using (var writer = new ParquetWriter(outStream))
     {
         DataSet ds          = null;
         int     recordCount = 0;
         foreach (var data in ReadFile(inStream))
         {
             if (recordCount == 0)
             {
                  var fields = new List<Parquet.Data.Field>();
                 foreach (var prop in data.Properties)
                 {
                     fields.Add(new DataField(prop.Key, prop.Value.GetType()));
                 }
                 foreach (var prop in data.SystemProperties)
                 {
                     fields.Add(new DataField(prop.Key, prop.Value.GetType()));
                 }
                  fields.Add(new DataField<byte[]>("Body"));
                 ds = new DataSet(fields.ToArray());
             }
              var values = new List<object>();
             values.AddRange(data.Properties.Values);
             values.AddRange(data.SystemProperties.Values);
             values.Add(data.Body.ToArray());
             ds.Add(values.ToArray());
             recordCount++;
         }
          // Guard against empty input: ds stays null when no records were read
          if (ds != null)
          {
              writer.Write(ds);
          }
     }
 }
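A minimal usage sketch for the helper above; the file names are placeholders, and the shape of the input depends on the ReadFile helper, which is not shown in this example:

     // Hypothetical invocation; paths are illustrative only.
     using (Stream inStream = File.OpenRead("messages.capture"))
     using (Stream outStream = File.Create("messages.parquet"))
     {
         CreateParquetFile(inStream, outStream);
     }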
Example #11
        public void Write_datetimeoffset()
        {
            var ds = new DataSet(
                new SchemaElement<DateTimeOffset>("timestamp_col")
                )
            {
                new DateTimeOffset(new DateTime(2017, 1, 1, 12, 13, 22)),
                new DateTimeOffset(new DateTime(2017, 1, 1, 12, 13, 23))
            };

            // 2 values for the column
            var uncompressed = new MemoryStream();

            using (var writer = new ParquetWriter(uncompressed))
            {
                writer.Write(ds, CompressionMethod.None);
            }

#if DEBUG
            const string path = "c:\\tmp\\first.parquet";
            F.WriteAllBytes(path, uncompressed.ToArray());
#endif
        }
        public void Array_write_read()
        {
            var table = new Table(
                new Schema(
                    new DataField <int>("id"),
                    new DataField <string[]>("categories") //array field
                    )
                );
            var ms = new MemoryStream();

            table.Add(1, new[] { "1", "2", "3" });
            table.Add(3, new[] { "3", "3", "3" });

            //write to stream
            using (var writer = new ParquetWriter(table.Schema, ms))
            {
                writer.Write(table);
            }

            //System.IO.File.WriteAllBytes("c:\\tmp\\1.parquet", ms.ToArray());

            //read back into table
            ms.Position = 0;
            Table table2;

            using (var reader = new ParquetReader(ms))
            {
                table2 = reader.ReadAsTable();
            }

            //validate data
            Assert.Equal(table.ToString(), table2.ToString(), ignoreLineEndingDifferences: true);
        }
        public void All_compression_methods_supported(CompressionMethod compressionMethod)
        {
            //v2
            var     ms  = new MemoryStream();
            DataSet ds1 = new DataSet(new DataField<int>("id"));
            DataSet ds2;

            ds1.Add(5);

            //write
            using (var writer = new ParquetWriter(ms))
            {
                writer.Write(ds1, compressionMethod);
            }

            //read back
            using (var reader = new ParquetReader(ms))
            {
                ms.Position = 0;
                ds2         = reader.Read();
            }

            Assert.Equal(5, ds2[0].GetInt(0));

            //v3
            //looks like writing is not working in certain scenarios!
            //broken length: 177
            //correct length: 187
            const int value  = 5;
            object    actual = WriteReadSingle(new DataField<int>("id"), value, compressionMethod);

            Assert.Equal(5, (int)actual);
        }
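The WriteReadSingle helper used in the v3 branch is not shown on this page. Below is a plausible sketch assembled only from the DataSet calls that do appear in these examples; the real helper in the test suite may differ:

        // Assumed shape: round-trip one value through a single-column data set
        // and return whatever comes back after reading.
        private static object WriteReadSingle(DataField field, object value, CompressionMethod compressionMethod)
        {
            var ds = new DataSet(field);
            ds.Add(value);

            var ms = new MemoryStream();
            ParquetWriter.Write(ds, ms, compressionMethod);

            ms.Position = 0;
            DataSet ds1 = ParquetReader.Read(ms);
            return ds1[0][0];
        }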
Example #14
        public void Write_int64datetimeoffset()
        {
            var element = new SchemaElement<DateTimeOffset>("timestamp_col");

            /*
             * {
             *     ThriftConvertedType = ConvertedType.TIMESTAMP_MILLIS,
             *     ThriftOriginalType  = Type.INT64
             * };
             */

            var ds = new DataSet(
                element
                )
            {
                new DateTimeOffset(new DateTime(2017, 1, 1, 12, 13, 22)),
                new DateTimeOffset(new DateTime(2017, 1, 1, 12, 13, 24))
            };

            // 2 values for the column
            var uncompressed = new MemoryStream();

            using (var writer = new ParquetWriter(uncompressed))
            {
                writer.Write(ds, CompressionMethod.None);
            }
        }
        public void List_simple_element_write_read()
        {
            var table = new Table(
                new Schema(
                    new DataField <int>("id"),
                    new ListField("cities",
                                  new DataField <string>("name"))));

            var ms = new MemoryStream();

            table.Add(1, new[] { "London", "Derby" });
            table.Add(2, new[] { "Paris", "New York" });

            //write as table
            using (var writer = new ParquetWriter(table.Schema, ms))
            {
                writer.Write(table);
            }

            //read back into table
            ms.Position = 0;
            Table table2;

            using (var reader = new ParquetReader(ms))
            {
                table2 = reader.ReadAsTable();
            }

            //validate data
            Assert.Equal(table.ToString(), table2.ToString(), ignoreLineEndingDifferences: true);
        }
Example #16
        public void Write_different_compressions()
        {
            var ds = new DataSet(
                new SchemaElement <int>("id"),
                new SchemaElement <bool>("bool_col"),
                new SchemaElement <string>("string_col")
                )
            {
                //8 values for each column

                { 4, true, "0" },
                { 5, false, "1" },
                { 6, true, "0" },
                { 7, false, "1" },
                { 2, true, "0" },
                { 3, false, "1" },
                { 0, true, "0" },
                { 1, false, "0" }
            };
            var uncompressed = new MemoryStream();

            ParquetWriter.Write(ds, uncompressed, CompressionMethod.None);

            var compressed = new MemoryStream();

            ParquetWriter.Write(ds, compressed, CompressionMethod.Gzip);

            var compressedSnappy = new MemoryStream();

            ParquetWriter.Write(ds, compressedSnappy, CompressionMethod.Snappy);
        }
        private void CompareWithMr(Table t)
        {
            string testFileName = Path.GetFullPath("temp.parquet");

            if (F.Exists(testFileName))
            {
                F.Delete(testFileName);
            }

            //produce file
            using (Stream s = F.OpenWrite(testFileName))
            {
                using (var writer = new ParquetWriter(t.Schema, s))
                {
                    writer.Write(t);
                }
            }

            //read back
            Table t2 = ParquetReader.ReadTableFromFile(testFileName);

            //check we don't have a bug internally before launching MR
            Assert.Equal(t.ToString("j"), t2.ToString("j"), ignoreLineEndingDifferences: true);

            string mrJson = ExecAndGetOutput(_javaExecName, $"-jar {_toolsJarPath} cat -j {testFileName}");

            Assert.Equal(t.ToString("j"), mrJson);
        }
        public override void Close()
        {
            _writer.Write(_ds);
            _writer.Dispose();

            //_tempStream.Position = 0;
            //_tempStream.CopyTo(_resultStream);
        }
Example #19
        public static DataSet WriteRead(DataSet original, WriterOptions writerOptions = null)
        {
            var ms = new MemoryStream();

            ParquetWriter.Write(original, ms, CompressionMethod.None, null, writerOptions);

            ms.Position = 0;
            return ParquetReader.Read(ms);
        }
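A short usage sketch for the round-trip helper above, following the pattern of the other tests on this page (the test name is hypothetical):

        public void WriteRead_round_trips_a_single_value()
        {
            var ds = new DataSet(new DataField<int>("id"));
            ds.Add(1);

            DataSet ds1 = WriteRead(ds);

            Assert.Equal(1, ds1[0].GetInt(0));
        }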
        public static DataSet WriteReadOpt(DataSet original, WriterOptions writerOptions = null)
        {
            var ms = new MemoryStream();

            ParquetWriter.Write(original, ms, CompressionMethod.None, null, writerOptions);
            ms.Flush();
            //System.IO.File.WriteAllBytes("c:\\tmp\\wr.parquet", ms.ToArray());

            ms.Position = 0;
            return ParquetReader.Read(ms);
        }
        private void FlushDataSet()
        {
            if (_ds == null)
            {
                return;
            }

            _writer.Write(_ds);

            _ds = null;
        }
Example #22
        public void Reads_created_by_metadata()
        {
            DataSet ds = DataSetGenerator.Generate(10);

            var ms = new MemoryStream();

            ParquetWriter.Write(ds, ms);

            ms.Position = 0;
            DataSet ds1 = ParquetReader.Read(ms);

            Assert.StartsWith("parquet-dotnet", ds1.Metadata.CreatedBy);
        }
        public void Write_in_small_chunks_to_forward_only_stream()
        {
            var ms          = new MemoryStream();
            var forwardOnly = new WriteableNonSeekableStream(ms);

            var ds = new DataSet(
                new SchemaElement <int>("id"),
                new SchemaElement <string>("nonsense"));

            ds.Add(1, Generator.RandomString);

            using (var writer = new ParquetWriter(forwardOnly))
            {
                writer.Write(ds);
                writer.Write(ds);
                writer.Write(ds);
            }

            ms.Position = 0;
            DataSet ds1 = ParquetReader.Read(ms);

            Assert.Equal(3, ds1.RowCount);
        }
        public void Floats()
        {
            var ds = new DataSet(new SchemaElement<float>("f"));

            ds.Add(1.23f);

            var ms = new MemoryStream();

            ParquetWriter.Write(ds, ms);

            ms.Position = 0;
            DataSet ds1 = ParquetReader.Read(ms);

            Assert.Equal(ds[0].GetFloat(0), ds1[0].GetFloat(0));
        }
        public void Doubles()
        {
            var ds = new DataSet(new SchemaElement<double>("d"));

            ds.Add(12.34);

            var ms = new MemoryStream();

            ParquetWriter.Write(ds, ms);

            ms.Position = 0;
            DataSet ds1 = ParquetReader.Read(ms);

            Assert.Equal(ds[0].GetDouble(0), ds1[0].GetDouble(0));
        }
Example #26
        public void Read_from_negative_offset_fails()
        {
            DataSet ds = DataSetGenerator.Generate(15);
            var     wo = new WriterOptions {
                RowGroupsSize = 5
            };
            var ro = new ReaderOptions {
                Offset = -4, Count = 2
            };

            var ms = new MemoryStream();

            ParquetWriter.Write(ds, ms, CompressionMethod.None, null, wo);

            ms.Position = 0;
            Assert.Throws<ParquetException>(() => ParquetReader.Read(ms, null, ro));
        }
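For contrast, here is a sketch of the positive case under the same setup, assuming ReaderOptions windows the rows as the negative test implies; this is not one of the original tests:

        public void Read_from_valid_offset_sketch()
        {
            DataSet ds = DataSetGenerator.Generate(15);
            var     wo = new WriterOptions { RowGroupsSize = 5 };
            var     ro = new ReaderOptions { Offset = 5, Count = 2 };

            var ms = new MemoryStream();

            ParquetWriter.Write(ds, ms, CompressionMethod.None, null, wo);

            ms.Position = 0;
            DataSet window = ParquetReader.Read(ms, null, ro);

            // Assumption: only the requested two rows come back.
            Assert.Equal(2, window.RowCount);
        }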
Example #27
        public void Append_to_file_with_different_schema_fails()
        {
            var ms = new MemoryStream();

            var ds1 = new DataSet(new SchemaElement<int>("id"));

            ds1.Add(1);
            ds1.Add(2);
            ParquetWriter.Write(ds1, ms);

            //append to file
            var ds2 = new DataSet(new SchemaElement<double>("id"));

            ds2.Add(3d);
            ds2.Add(4d);
            Assert.Throws<ParquetException>(() => ParquetWriter.Write(ds2, ms, CompressionMethod.Gzip, null, null, true));
        }
Example #28
        public void Write_different_compressions()
        {
            var ds = new DataSet(
                new SchemaElement <int>("id"),
                new SchemaElement <bool>("bool_col"),
                new SchemaElement <string>("string_col")
                );

            //8 values for each column

            ds.Add(4, true, "0");
            ds.Add(5, false, "1");
            ds.Add(6, true, "0");
            ds.Add(7, false, "1");
            ds.Add(2, true, "0");
            ds.Add(3, false, "1");
            ds.Add(0, true, "0");
            ds.Add(1, false, "0");

            var uncompressed = new MemoryStream();

            using (var writer = new ParquetWriter(uncompressed))
            {
                writer.Write(ds, CompressionMethod.None);
            }

            var compressed = new MemoryStream();

            using (var writer = new ParquetWriter(compressed))
            {
                writer.Write(ds, CompressionMethod.Gzip);
            }

            var compressedSnappy = new MemoryStream();

            using (var writer = new ParquetWriter(compressedSnappy))
            {
                writer.Write(ds, CompressionMethod.Snappy);
            }

#if DEBUG
            const string path = "c:\\tmp\\first.parquet";
            F.WriteAllBytes(path, uncompressed.ToArray());
#endif
        }
Example #29
        public void Type_write_byte_and_short_byte()
        {
            var schema = new Schema(new SchemaElement<sbyte>("sbyte"), new SchemaElement<byte>("byte"));
            var ds     = new DataSet(schema)
            {
                { (sbyte)121, (byte)122 }
            };

            var ms = new MemoryStream();

            ParquetWriter.Write(ds, ms);

            ms.Position = 0;
            DataSet ds1 = ParquetReader.Read(ms);

            Assert.Equal(121, (sbyte)ds1[0][0]);
            Assert.Equal(122, (byte)ds1[0][1]);
        }
 /// <summary>
 ///		Writes the file
 /// </summary>
 protected override void SaveFile(LibLogger.Models.Log.BlockLogModel block, string fileName)
 {
     // Write the file
     using (CsvReader reader = new CsvReader(FileName, FileParameters, FileColumns))
     {
         using (ParquetWriter writer = new ParquetWriter(fileName))
         {
             // Log
             writer.Progress += (sender, args) => block.Progress(System.IO.Path.GetFileName(fileName), args.Records, args.Records + 1);
             // Write the file
             writer.Write(reader);
         }
     }
     // Log
     block.Progress(System.IO.Path.GetFileName(fileName), 0, 0);
     block.Info($"Fin de la grabación del archivo '{fileName}'");
     SolutionViewModel.MainController.Logger.Flush();
 }