Example #1
0
        /// <summary>
        /// Round-trips a single value through a full Parquet write/read cycle and
        /// verifies that the value read back equals the value written.
        /// </summary>
        public void Type_writes_and_reads_end_to_end(SchemaElement schema, object value, string name = null)
        {
            var source = new DataSet(schema)
            {
                new Row(value)
            };

            var buffer = new MemoryStream();
            ParquetWriter.Write(source, buffer);

            buffer.Position = 0;
            DataSet roundTripped = ParquetReader.Read(buffer);

            object written = source[0][0];
            object read    = roundTripped[0][0];

            // DateTime values are materialized as DateTimeOffset on read; unwrap
            // the DateTime part so the comparison below is apples-to-apples.
            if (schema.ElementType == typeof(DateTime))
            {
                read = ((DateTimeOffset)read).DateTime;
            }

            Assert.True(written.Equals(read),
                        $"{name}| expected: {written}, actual: {read}, schema element: {schema}");
        }
        /// <summary>
        /// Writes a single-row data set with the given compression method and checks
        /// that it reads back intact, exercising both the v2 and v3 write paths.
        /// </summary>
        public void All_compression_methods_supported(CompressionMethod compressionMethod)
        {
            //v2 write/read round trip
            var stream = new MemoryStream();
            var source = new DataSet(new DataField <int>("id"));
            source.Add(5);

            using (var writer = new ParquetWriter(stream))
            {
                writer.Write(source, compressionMethod);
            }

            DataSet readBack;
            using (var reader = new ParquetReader(stream))
            {
                stream.Position = 0;
                readBack = reader.Read();
            }

            Assert.Equal(5, readBack[0].GetInt(0));

            //v3 round trip
            //looks like writing is not working in certain scenarios!
            //broken length: 177
            //correct length: 187
            const int value = 5;
            object actual = WriteReadSingle(new DataField <int>("id"), value, compressionMethod);

            Assert.Equal(5, (int)actual);
        }
Example #3
0
        /// <summary>
        /// Opens a UWP storage file and reads a window of rows from it as a Parquet
        /// data set. On a parse failure, shows a message dialog and returns null.
        /// </summary>
        /// <param name="file">File to open for reading.</param>
        /// <param name="offset">Row offset to start reading from.</param>
        /// <param name="count">Maximum number of rows to read.</param>
        public static async Task <DataSet> LoadAsync(StorageFile file, int offset = 0, int count = 100)
        {
            using (IRandomAccessStreamWithContentType uwpStream = await file.OpenReadAsync())
            using (Stream stream = uwpStream.AsStreamForRead())
            {
                var readerOptions = new ReaderOptions
                {
                    Offset = offset,
                    Count  = count
                };

                // Byte arrays in the file are surfaced to the UI as strings.
                var formatOptions = new ParquetOptions { TreatByteArrayAsString = true };

                try
                {
                    return ParquetReader.Read(stream, formatOptions, readerOptions);
                }
                catch (Exception ex)
                {
                    // Surface the failure to the user instead of crashing the app.
                    var dialog = new MessageDialog(ex.Message, "Cannot open file");
                    await dialog.ShowAsync();

                    return null;
                }
            }
        }
Example #4
0
        /// <summary>
        /// Round-trips a nullable integer column and verifies values and nulls
        /// come back in their original positions.
        /// </summary>
        public void Write_and_read_nullable_integers()
        {
            var source = new DataSet(new DataField <int?>("id"))
            {
                1,
                2,
                3,
                (object)null,
                4,
                (object)null,
                5
            };

            var buffer = new MemoryStream();
            ParquetWriter.Write(source, buffer);

            buffer.Position = 0;
            DataSet readBack = ParquetReader.Read(buffer);

            // Compare each row against the sequence that was written, including nulls.
            int?[] expected = { 1, 2, 3, null, 4, null, 5 };
            for (int i = 0; i < expected.Length; i++)
            {
                if (expected[i] == null)
                {
                    Assert.True(readBack[i].IsNullAt(0));
                }
                else
                {
                    Assert.Equal(expected[i].Value, readBack[i].GetInt(0));
                }
            }
        }
Example #5
0
        /// <summary>
        /// Writes a single-row data set with the given compression method and checks
        /// that it reads back intact, exercising both the v2 and v3 write paths.
        /// </summary>
        /// <param name="compressionMethod">Compression method under test.</param>
        public void All_compression_methods_supported(CompressionMethod compressionMethod)
        {
            //v2
            var     ms  = new MemoryStream();
            DataSet ds1 = new DataSet(new DataField <int>("id"));
            DataSet ds2;

            ds1.Add(5);

            //write
            using (var writer = new ParquetWriter(ms))
            {
                // Use the method under test; this was hard-coded to Gzip, which
                // defeated the purpose of the parameterized test.
                writer.Write(ds1, compressionMethod);
            }

            //read back
            using (var reader = new ParquetReader(ms))
            {
                ms.Position = 0;
                ds2         = reader.Read();
            }

            Assert.Equal(5, ds2[0].GetInt(0));

            //v3
            const int value  = 5;
            object    actual = WriteReadSingle(new DataField <int>("id"), value, compressionMethod);

            Assert.Equal(5, (int)actual);
        }
Example #6
0
        /// <summary>
        /// Round-trips a nullable integer column and verifies values and nulls
        /// come back in their original positions.
        /// </summary>
        public void Write_and_read_nullable_integers()
        {
            // Schema must be declared nullable (int?) — the rows below contain nulls,
            // which a non-nullable SchemaElement<int> cannot represent.
            var ds = new DataSet(new SchemaElement <int?>("id"))
            {
                1,
                2,
                3,
                (object)null,
                4,
                (object)null,
                5
            };
            var ms = new MemoryStream();

            ParquetWriter.Write(ds, ms);

            ms.Position = 0;
            DataSet ds1 = ParquetReader.Read(ms);

            // xUnit convention: expected value first, actual second (was reversed).
            Assert.Equal(1, ds1[0].GetInt(0));
            Assert.Equal(2, ds1[1].GetInt(0));
            Assert.Equal(3, ds1[2].GetInt(0));
            Assert.True(ds1[3].IsNullAt(0));
            Assert.Equal(4, ds1[4].GetInt(0));
            Assert.True(ds1[5].IsNullAt(0));
            Assert.Equal(5, ds1[6].GetInt(0));
        }
Example #7
0
      /// <summary>
      /// Verifies a two-column data set survives a snappy-compressed write/read cycle.
      /// </summary>
      public void I_can_write_snappy_and_read_back()
      {
         var buffer = new MemoryStream();
         var source = new DataSet(
            new DataField<int>("id"),
            new DataField<int>("no"));

         source.Add(1, 3);
         source.Add(2, 4);

         //write compressed with snappy
         using (var writer = new ParquetWriter(buffer))
         {
            writer.Write(source, CompressionMethod.Snappy);
         }

         //rewind and read back
         DataSet readBack;
         using (var reader = new ParquetReader(buffer))
         {
            buffer.Position = 0;
            readBack = reader.Read();
         }

         Assert.Equal(1, readBack[0].GetInt(0));
         Assert.Equal(2, readBack[1].GetInt(0));
         Assert.Equal(3, readBack[0].GetInt(1));
         Assert.Equal(4, readBack[1].GetInt(1));
      }
Example #8
0
            /// <summary>
            /// Advances the cursor to the next row. Rows of the currently loaded
            /// data set are drained first; when exhausted, the next block of the
            /// parquet file is read and enumeration restarts over it.
            /// </summary>
            /// <returns>True while a row is available; false once all blocks are consumed.</returns>
            protected override bool MoveNextCore()
            {
                if (_dataSetEnumerator.MoveNext())
                {
                    _curDataSetRow = _dataSetEnumerator.Current;
                    return(true);
                }
                else if (_blockEnumerator.MoveNext())
                {
                    // Translate the block index into a row offset for the reader.
                    _readerOptions.Offset = (long)_blockEnumerator.Current * _readerOptions.Count;

                    // When current dataset runs out, read the next portion of the parquet file.
                    DataSet ds;
                    lock (_loader._parquetStream)
                    {
                        // The stream is shared by the loader; serialize access while reading.
                        ds = ParquetReader.Read(_loader._parquetStream, _loader._parquetOptions, _readerOptions);
                    }

                    // Visit order for rows of the freshly read data set (may be shuffled).
                    var dataSetOrder = CreateOrderSequence(ds.RowCount);
                    _dataSetEnumerator = dataSetOrder.GetEnumerator();
                    _curDataSetRow     = dataSetOrder.ElementAt(0);

                    // Cache list for each active column
                    for (int i = 0; i < _actives.Length; i++)
                    {
                        Column col = _loader._columnsLoaded[_actives[i]];
                        _columnValues[i] = ds.GetColumn(col.DataField);
                    }

                    return(_dataSetEnumerator.MoveNext());
                }
                return(false);
            }
Example #9
0
        /// <summary>
        /// Reads a deeply nested test file (arrays of structs containing arrays, nested
        /// structs, etc.) and verifies both rows render to the expected string form.
        /// </summary>
        public void Reads_really_mad_nested_file()
        {
            /* Spark schema:
             * root
             |-- addresses: array (nullable = true)
             |    |-- element: struct (containsNull = true)
             |    |    |-- line1: string (nullable = true)
             |    |    |-- name: string (nullable = true)
             |    |    |-- openingHours: array (nullable = true)
             |    |    |    |-- element: long (containsNull = true)
             |    |    |-- postcode: string (nullable = true)
             |-- cities: array (nullable = true)
             |    |-- element: string (containsNull = true)
             |-- comment: string (nullable = true)
             |-- id: long (nullable = true)
             |-- location: struct (nullable = true)
             |    |-- latitude: double (nullable = true)
             |    |-- longitude: double (nullable = true)
             |-- price: struct (nullable = true)
             |    |-- lunch: struct (nullable = true)
             |    |    |-- max: long (nullable = true)
             |    |    |-- min: long (nullable = true)
             */

            DataSet ds = ParquetReader.Read(OpenTestFile("nested.parquet"));

            //much easier to compare mad nestness with .ToString(), but will break when it changes
            const string expectedRow =
                "{[{Dante Road;Head Office;[9;10;11;12;13;14;15;16;17;18];SE11};{Somewhere Else;Small Office;[6;7;19;20;21;22;23];TN19}];[London;Derby];this file contains all the permunations for nested structures and arrays to test Parquet parser;1;{51.2;66.3};{{2;1}}}";

            Assert.Equal(expectedRow, ds[0].ToString());
            Assert.Equal(expectedRow, ds[1].ToString());
        }
Example #10
0
            /// <summary>
            /// Advances the cursor to the next row. Rows of the currently loaded
            /// data set are drained first; when exhausted, the next block of the
            /// parquet file is read and enumeration restarts over it.
            /// </summary>
            /// <returns>True while a row is available; false once all blocks are consumed.</returns>
            protected override bool MoveNextCore()
            {
                if (_dataSetEnumerator.MoveNext())
                {
                    _curDataSetRow = (int)_dataSetEnumerator.Current;
                    return(true);
                }
                else if (_blockEnumerator.MoveNext())
                {
                    // Translate the block index into a row offset for the reader.
                    _readerOptions.Offset = (int)_blockEnumerator.Current * _readerOptions.Count;

                    // When current dataset runs out, read the next portion of the parquet file.
                    DataSet ds;
                    lock (_loader._parquetStream)
                    {
                        // The stream is shared by the loader; serialize access while reading.
                        ds = ParquetReader.Read(_loader._parquetStream, _loader._parquetOptions, _readerOptions);
                    }

                    // Visit rows in identity order, or shuffled when a random source is set.
                    int[] dataSetOrder = _rand == null?Utils.GetIdentityPermutation(ds.RowCount) : Utils.GetRandomPermutation(_rand, ds.RowCount);

                    _dataSetEnumerator = dataSetOrder.GetEnumerator();
                    _curDataSetRow     = dataSetOrder[0];

                    // Cache list for each active column
                    for (int i = 0; i < _actives.Length; i++)
                    {
                        Column col = _loader._columnsLoaded[_actives[i]];
                        _columnValues[i] = ds.GetColumn(col.DataField);
                    }

                    return(_dataSetEnumerator.MoveNext());
                }
                return(false);
            }
Example #11
0
        /// <summary>
        /// Writes a data set to an in-memory buffer without compression and
        /// immediately reads it back — the round-trip helper used by tests.
        /// </summary>
        public static DataSet WriteRead(DataSet original, WriterOptions writerOptions = null)
        {
            var buffer = new MemoryStream();
            ParquetWriter.Write(original, buffer, CompressionMethod.None, null, writerOptions);

            buffer.Position = 0;
            return ParquetReader.Read(buffer);
        }
Example #12
0
 /// <summary>
 /// Opens the named test file and reads it as a data set,
 /// surfacing byte-array columns as strings.
 /// </summary>
 private DataSet ReadParquet(string name)
 {
     using (Stream s = OpenTestFile(name))
     {
         var options = new ParquetOptions { TreatByteArrayAsString = true };
         return ParquetReader.Read(s, options);
     }
 }
Example #13
0
        /// <summary>
        /// A file whose single column contains only nulls must still expose its
        /// schema and report the null cell correctly.
        /// </summary>
        public void Read_all_nulls_file()
        {
            DataSet allNulls = ParquetReader.Read(OpenTestFile("all_nulls.parquet"));

            // exactly one column, named "lognumber"
            Assert.Equal(1, allNulls.Schema.Length);
            Assert.Equal("lognumber", allNulls.Schema[0].Name);

            // a single row whose only cell is null
            Assert.Equal(1, allNulls.RowCount);
            Assert.Null(allNulls[0][0]);
        }
Example #14
0
        /// <summary>
        /// Reads a simple map column and verifies its schema name and rendered content.
        /// </summary>
        public void Read_simple_map()
        {
            DataSet ds = ParquetReader.Read(OpenTestFile("map.parquet"));

            // second schema element is the map column
            Field mapField = ds.Schema[1];
            Assert.Equal("numbers", mapField.Name);

            Assert.Equal("{1;[1=>one;2=>two;3=>three]}", ds[0].ToString());
        }
 /// <summary>
 /// Smoke test: a file with a fixed-length byte array dictionary column
 /// must be readable without throwing.
 /// </summary>
 public void FixedLenByteArray_dictionary()
 {
     using (Stream s = F.OpenRead(GetDataFilePath("fixedlenbytearray.parquet")))
     using (var reader = new ParquetReader(s))
     {
         DataSet ds = reader.Read();
     }
 }
 /// <summary>
 /// Smoke test: a file containing date-typed columns must be readable
 /// without throwing.
 /// </summary>
 public void Datetypes_all()
 {
     using (Stream s = F.OpenRead(GetDataFilePath("dates.parquet")))
     using (var reader = new ParquetReader(s))
     {
         DataSet ds = reader.Read();
     }
 }
Example #17
0
        /// <summary>
        /// A list column where some rows hold empty lists must read back with
        /// the empty lists preserved in their positions.
        /// </summary>
        public void List_of_elements_with_some_items_empty_reads_file()
        {
            DataSet ds = ParquetReader.Read(OpenTestFile("listofitems-empty-alternates.parquet"));

            Assert.Equal(4, ds.RowCount);

            string[] expectedRows =
            {
                "{1;[1;2;3]}",
                "{2;[]}",
                "{3;[1;2;3]}",
                "{4;[]}"
            };
            for (int i = 0; i < expectedRows.Length; i++)
            {
                Assert.Equal(expectedRows[i], ds[i].ToString());
            }
        }
Example #18
0
        /// <summary>
        /// Writes a data set to an in-memory buffer without compression, using the
        /// supplied writer options, then reads it straight back.
        /// </summary>
        public static DataSet WriteReadOpt(DataSet original, WriterOptions writerOptions = null)
        {
            var buffer = new MemoryStream();

            ParquetWriter.Write(original, buffer, CompressionMethod.None, null, writerOptions);
            buffer.Flush();

            buffer.Position = 0;
            return ParquetReader.Read(buffer);
        }
Example #19
0
        /// <summary>
        /// Reading with an explicit column list must return only the requested columns.
        /// </summary>
        public void Read_only_limited_columns()
        {
            var options = new ReaderOptions { Columns = new[] { "n_name", "n_regionkey" } };

            DataSet ds = ParquetReader.Read(OpenTestFile("nation.impala.parquet"), null, options);

            // only the two requested columns should be present
            Assert.Equal(2, ds.FieldCount);
        }
Example #20
0
        /// <summary>
        /// Reads a file containing legacy-encoded decimals and spot-checks
        /// the values (including a null) in the first row.
        /// </summary>
        public void Read_all_legacy_decimals()
        {
            DataSet ds = ParquetReader.Read(OpenTestFile("decimallegacy.parquet"));
            Row firstRow = ds[0];

            Assert.Equal(1, (int)firstRow[0]);
            Assert.Equal(1.2m, (decimal)firstRow[1], 2);
            Assert.Null(firstRow[2]);
            Assert.Equal(-1m, (decimal)firstRow[3], 2);
        }
Example #21
0
        /// <summary>
        /// Compatibility check: a large customer file produced by Impala
        /// reads back with every row accounted for.
        /// </summary>
        public void Reads_compat_customer_impala_file()
        {
            /*
             * c_name:
             *    45 pages (0-44)
             */
            DataSet customer = ParquetReader.Read(OpenTestFile("customer.impala.parquet"));

            Assert.Equal(150000, customer.RowCount);
        }
Example #22
0
        /// <summary>
        /// Reads the next batch of records from the data file reader, remembers the
        /// total row count, and converts the batch to a DataTable for display.
        /// </summary>
        /// <remarks>
        /// NOTE(review): <paramref name="lowerPageBoundary"/> and <paramref name="rowsPerPage"/>
        /// are not used here — paging appears to be handled by the underlying reader's own
        /// state; confirm against the base class contract.
        /// </remarks>
        protected override DataTable ReadRecordsFrom(int lowerPageBoundary, int rowsPerPage)
        {
            //var records = ParquetReader.Read(fileStream, options, readOptions);

            var records = dataFileReader.Read();

            // Cache the overall row count for the paging UI.
            _totalRowCount = (int)records.TotalRowCount;

            var dataTable = convertToDataTable(records);

            return(dataTable);
        }
Example #23
0
        /// <summary>
        /// Files written by this library must carry a "parquet-dotnet"
        /// created-by marker in their metadata.
        /// </summary>
        public void Reads_created_by_metadata()
        {
            DataSet generated = DataSetGenerator.Generate(10);

            var buffer = new MemoryStream();
            ParquetWriter.Write(generated, buffer);

            buffer.Position = 0;
            DataSet readBack = ParquetReader.Read(buffer);

            Assert.StartsWith("parquet-dotnet", readBack.Metadata.CreatedBy);
        }
        /// <summary>
        /// Round-trips a single float value through write and read.
        /// </summary>
        public void Floats()
        {
            var source = new DataSet(new SchemaElement <float>("f"));
            source.Add((float)1.23);

            var buffer = new MemoryStream();
            ParquetWriter.Write(source, buffer);

            buffer.Position = 0;
            DataSet readBack = ParquetReader.Read(buffer);

            Assert.Equal(source[0].GetFloat(0), readBack[0].GetFloat(0));
        }
        /// <summary>
        /// Round-trips a single double value through write and read.
        /// </summary>
        public void Doubles()
        {
            var source = new DataSet(new SchemaElement <double>("d"));
            source.Add((double)12.34);

            var buffer = new MemoryStream();
            ParquetWriter.Write(source, buffer);

            buffer.Position = 0;
            DataSet readBack = ParquetReader.Read(buffer);

            Assert.Equal(source[0].GetDouble(0), readBack[0].GetDouble(0));
        }
Example #26
0
        /// <summary>
        /// Reads date columns from a test file and verifies the dates of the
        /// first two rows.
        /// </summary>
        public void Datetypes_all()
        {
            DateTimeOffset row0Value, row1Value;

            using (Stream s = OpenTestFile("dates.parquet"))
            using (var reader = new ParquetReader(s))
            {
                DataSet ds = reader.Read();
                row0Value = (DateTimeOffset)ds[0][1];
                row1Value = (DateTimeOffset)ds[1][1];
            }

            Assert.Equal(new DateTime(2017, 1, 1), row0Value.Date);
            Assert.Equal(new DateTime(2017, 2, 1), row1Value.Date);
        }
Example #27
0
        /// <summary>
        /// Reading with a negative row offset must throw a ParquetException.
        /// </summary>
        public void Read_from_negative_offset_fails()
        {
            DataSet ds = DataSetGenerator.Generate(15);
            var writerOptions = new WriterOptions { RowGroupsSize = 5 };
            var readerOptions = new ReaderOptions { Offset = -4, Count = 2 };

            var ms = new MemoryStream();
            ParquetWriter.Write(ds, ms, CompressionMethod.None, null, writerOptions);

            ms.Position = 0;
            Assert.Throws <ParquetException>(() => ParquetReader.Read(ms, null, readerOptions));
        }
Example #28
0
        /// <summary>
        /// Round-trips signed and unsigned byte columns and verifies both values survive.
        /// </summary>
        public void Type_write_byte_and_short_byte()
        {
            var schema = new Schema(new SchemaElement <sbyte>("sbyte"), new SchemaElement <byte>("byte"));
            var source = new DataSet(schema);
            source.Add((sbyte)121, (byte)122);

            var buffer = new MemoryStream();
            ParquetWriter.Write(source, buffer);

            buffer.Position = 0;
            DataSet readBack = ParquetReader.Read(buffer);

            Assert.Equal(121, (sbyte)readBack[0][0]);
            Assert.Equal(122, (byte)readBack[0][1]);
        }
Example #29
0
        /// <summary>
        /// Initializes the loader: validates the input stream, reads only the Parquet
        /// schema (a zero-row read) to discover the columns, and builds the loader schema.
        /// </summary>
        /// <param name="args">Loader arguments (column chunk size, big-integer date handling, ...).</param>
        /// <param name="host">Host environment used for validation and channels.</param>
        /// <param name="stream">Readable, seekable stream positioned at the start of the Parquet data.</param>
        /// <exception cref="InvalidDataException">Thrown when the stream cannot be parsed as Parquet.</exception>
        private ParquetLoader(Arguments args, IHost host, Stream stream)
        {
            Contracts.AssertValue(host, nameof(host));
            _host = host;

            // The stream must be readable, seekable, and at position 0 — Parquet
            // parsing seeks to the file footer, then back into the body.
            _host.CheckValue(args, nameof(args));
            _host.CheckValue(stream, nameof(stream));
            _host.CheckParam(stream.CanRead, nameof(stream), "input stream must be readable");
            _host.CheckParam(stream.CanSeek, nameof(stream), "input stream must be seekable");
            _host.CheckParam(stream.Position == 0, nameof(stream), "input stream must be at head");

            using (var ch = _host.Start("Initializing host"))
            {
                _parquetStream  = stream;
                _parquetOptions = new ParquetOptions()
                {
                    TreatByteArrayAsString  = true,
                    TreatBigIntegersAsDates = args.TreatBigIntegersAsDates
                };

                DataSet schemaDataSet;

                try
                {
                    // We only care about the schema so ignore the rows.
                    ReaderOptions readerOptions = new ReaderOptions()
                    {
                        Count  = 0,
                        Offset = 0
                    };
                    schemaDataSet = ParquetReader.Read(stream, _parquetOptions, readerOptions);
                    // Even a zero-row read reports the file's total row count.
                    _rowCount     = schemaDataSet.TotalRowCount;
                }
                catch (Exception ex)
                {
                    // Normalize any parse failure into a single exception type for callers.
                    throw new InvalidDataException("Cannot read Parquet file", ex);
                }

                _columnChunkReadSize = args.ColumnChunkReadSize;
                _columnsLoaded       = InitColumns(schemaDataSet);
                Schema = CreateSchema(_host, _columnsLoaded);
            }
        }
Example #30
0
        /// <summary>
        /// Buffers the unstructured input into memory (Parquet parsing needs a
        /// seekable stream), reads it, and rebuilds the column-name lookup.
        /// </summary>
        private void Read(IUnstructuredReader reader)
        {
            //i'm not sure how to read this any other way as Parquet needs seekable stream
            using (var ms = new MemoryStream())
            {
                reader.BaseStream.CopyTo(ms);
                ms.Position = 0;

                var options = new ParquetOptions { TreatByteArrayAsString = true };
                _parquet = ParquetReader.Read(ms, options);
            }

            // Rebuild the name -> column-index map from the freshly read schema.
            _columnNameToIndex.Clear();
            for (int index = 0; index < _parquet.Schema.Length; index++)
            {
                _columnNameToIndex[_parquet.Schema[index].Name] = index;
            }
        }