Ejemplo n.º 1
0
 /// <summary>
 /// Ad-hoc debugging helper: reads a hard-coded local Parquet file with
 /// <c>TreatByteArrayAsString</c> switched on. The loaded data set is not used afterwards.
 /// </summary>
 private static void Debug()
 {
     // GC latency tweak, currently disabled:
     // GCSettings.LatencyMode = GCLatencyMode.LowLatency;

     var readOptions = new ParquetOptions();
     readOptions.TreatByteArrayAsString = true;

     DataSet loaded = ParquetReader.ReadFile("c:\\tmp\\customer.impala.parquet", readOptions);
 }
Ejemplo n.º 2
0
        /// <summary>
        /// Reads "simplenested.parquet" and verifies the row and column counts, the schema
        /// element types and column names, the three values of the nested "city" row and
        /// the value of the "id" column.
        /// </summary>
        public void Read_simple_nested_field()
        {
            /*
             * root
             |-- city: struct (nullable = true)
             |    |-- country: string (nullable = true)
             |    |-- isCapital: boolean (nullable = true)
             |    |-- name: string (nullable = true)
             |-- id: long (nullable = true)
             */

            DataSet ds = ParquetReader.ReadFile(GetDataFilePath("simplenested.parquet"));

            Assert.Equal(1, ds.RowCount);
            Assert.Equal(2, ds.ColumnCount);

            Assert.Equal(typeof(Row), ds.Schema[0].ElementType);
            Assert.Equal(typeof(long), ds.Schema[1].ElementType);

            Assert.Equal("city", ds.Schema.ColumnNames[0]);
            Assert.Equal("id", ds.Schema.ColumnNames[1]);

            Row mr = ds[0];

            // Column 0 holds the nested "city" struct, surfaced as its own Row.
            Row city = mr.Get<Row>(0);

            // Assert.Equal takes (expected, actual): the literal goes first so that a
            // failure message reports expected/actual the right way round, matching the
            // other assertions in this test.
            Assert.Equal("United Kingdom", city[0]);
            Assert.Equal(true, city[1]);
            Assert.Equal("London", city[2]);

            Assert.Equal(1L, mr[1]);
        }
Ejemplo n.º 3
0
 /// <summary>
 /// Ad-hoc debugging helper: reads the customer sample file from a hard-coded
 /// checkout path with <c>TreatByteArrayAsString</c> switched on. The loaded data
 /// set is not used afterwards.
 /// </summary>
 private static void Debug()
 {
     // GC latency tweak, currently disabled:
     // GCSettings.LatencyMode = GCLatencyMode.LowLatency;

     var readOptions = new ParquetOptions();
     readOptions.TreatByteArrayAsString = true;

     DataSet loaded = ParquetReader.ReadFile(@"C:\dev\parquet-dotnet\src\Parquet.Test\data\customer.impala.parquet", readOptions);
 }
Ejemplo n.º 4
0
        /// <summary>
        /// Reads the Parquet file at <paramref name="path"/> and reports its size on disk.
        /// </summary>
        /// <param name="path">Path passed to both <c>System.IO.FileInfo</c> and <c>ParquetReader.ReadFile</c>.</param>
        /// <param name="fileLen">Receives <c>FileInfo.Length</c> for <paramref name="path"/>; it is assigned before the file is parsed.</param>
        /// <returns>The data set returned by <c>ParquetReader.ReadFile</c>.</returns>
        public static DataSet ReadFromParquetFile(string path, out long fileLen)
        {
            // The size is queried first, so the out parameter is set before parsing begins.
            fileLen = new System.IO.FileInfo(path).Length;

            DataSet contents = ParquetReader.ReadFile(path);
            return contents;
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Reads "repeatedstruct.parquet" and checks the string form of its first row.
        /// </summary>
        public void Reads_repeated_struct()
        {
            string dataFile = GetDataFilePath("repeatedstruct.parquet");
            DataSet data = ParquetReader.ReadFile(dataFile);

            string firstRow = data[0].ToString();

            Assert.Equal("{[{UK;London};{US;New York}];1}", firstRow);

            // Disabled local experiment:
            // DataSet ds1 = ParquetReader.ReadFile("c:\\tmp\\rep.parquet");
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Reads "listofitems-empty-alternates.parquet" and checks that it has four rows whose
        /// string forms alternate between a populated list and an empty list.
        /// </summary>
        public void List_of_elements_with_some_items_empty_reads_file()
        {
            DataSet data = ParquetReader.ReadFile(GetDataFilePath("listofitems-empty-alternates.parquet"));

            // Expected string form of each row, in row order.
            string[] expectedRows =
            {
                "{1;[1;2;3]}",
                "{2;[]}",
                "{3;[1;2;3]}",
                "{4;[]}"
            };

            Assert.Equal(4, data.RowCount);

            for (int rowIndex = 0; rowIndex < expectedRows.Length; rowIndex++)
            {
                Assert.Equal(expectedRows[rowIndex], data[rowIndex].ToString());
            }
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Crude read benchmark: reads a hard-coded local Parquet file four times, logs the
        /// elapsed time of each read, then logs the average in milliseconds with the first
        /// run excluded from the average.
        /// </summary>
        private static void Perf()
        {
            var readDurations = new List<TimeSpan>();

            // Only referenced by the disabled write benchmark further down.
            var writeUncompressedTimes = new List<TimeSpan>();
            var writeGzipTimes = new List<TimeSpan>();
            var writeSnappyTimes = new List<TimeSpan>();

            for (int run = 0; run < 4; run++)
            {
                DataSet ds;

                using (var stopwatch = new TimeMeasure())
                {
                    ds = ParquetReader.ReadFile("C:\\tmp\\customer.impala.parquet");

                    TimeSpan readDuration = stopwatch.Elapsed;
                    readDurations.Add(readDuration);
                    log.Trace("read in {0}", readDuration);
                }

                // Disabled write benchmark:
                //
                // string dest = "c:\\tmp\\write.test.parquet";
                // if (F.Exists(dest)) F.Delete(dest);
                //
                // using (var time = new TimeMeasure())
                // {
                //     ParquetWriter.WriteFile(ds, dest, CompressionMethod.None);
                //     writeUncompressedTimes.Add(time.Elapsed);
                // }
                //
                // using (var time = new TimeMeasure())
                // {
                //     ParquetWriter.WriteFile(ds, dest, CompressionMethod.Gzip);
                //     writeGzipTimes.Add(time.Elapsed);
                // }
                //
                // using (var time = new TimeMeasure())
                // {
                //     ParquetWriter.WriteFile(ds, dest, CompressionMethod.Snappy);
                //     writeSnappyTimes.Add(time.Elapsed);
                // }

                log.Trace("run finished: {0}", run);
            }

            // Skip(1) leaves the first run out of the average.
            double avgRead = readDurations
                .Skip(1)
                .Select(duration => duration.TotalMilliseconds)
                .Average();

            log.Trace("avg: {0}", avgRead);

            // Disabled write averages. The snappy line previously averaged
            // writeUncompressedTimes (copy/paste slip); it is corrected here in the comment:
            //
            // double avgUncompressed = writeUncompressedTimes.Skip(1).Average(t => t.TotalMilliseconds);
            // double avgGzip = writeGzipTimes.Skip(1).Average(t => t.TotalMilliseconds);
            // double avgSnappy = writeSnappyTimes.Skip(1).Average(t => t.TotalMilliseconds);
            //
            // log.Trace("averages => read: {0}, uncompressed: {1}, gzip: {2}, snappy: {3}", avgRead, avgUncompressed, avgGzip, avgSnappy);
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Reads "map.parquet" and checks the name of the second schema element, the maximum
        /// repetition/definition levels of its two extra elements, and the string form of
        /// the first row.
        /// </summary>
        public void Read_simple_map()
        {
            DataSet data = ParquetReader.ReadFile(GetDataFilePath("map.parquet"));

            SchemaElement mapColumn = data.Schema[1];
            Assert.Equal("numbers", mapColumn.Name);

            var firstExtra = mapColumn.Extra[0];
            Assert.Equal(1, firstExtra.MaxRepetitionLevel);
            Assert.Equal(2, firstExtra.MaxDefinitionLevel);

            var secondExtra = mapColumn.Extra[1];
            Assert.Equal(1, secondExtra.MaxRepetitionLevel);
            Assert.Equal(3, secondExtra.MaxDefinitionLevel);

            string firstRow = data[0].ToString();
            Assert.Equal("{1;[1=>one;2=>two;3=>three]}", firstRow);

            // Disabled write experiments:
            // DataSet ds2 = DataSetGenerator.WriteRead(ds);
            // ParquetWriter.WriteFile(ds, "c:\\tmp\\pmaps.parquet", CompressionMethod.None);
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Console entry point: configures console logging, reads a hard-coded local Parquet
        /// file while timing the read, then prints the elapsed time and the row count.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            L.Config.WriteTo.PoshConsole();

            DataSet postcodes;

            using (var stopwatch = new TimeMeasure())
            {
                postcodes = ParquetReader.ReadFile("C:\\tmp\\postcodes.plain.parquet");

                var elapsed = stopwatch.Elapsed;
                Console.WriteLine("read in {0}", elapsed);
            }

            Console.WriteLine("has {0} rows", postcodes.RowCount);

            // Timings noted for postcodes.plain.parquet (137Mb):
            //   debug:   26 seconds
            //   release: 25 seconds
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Checks that reading "nested.parquet" throws <see cref="NotSupportedException"/>.
        /// The assertions for a successful read are kept below, commented out.
        /// </summary>
        public void Reads_really_mad_nested_file()
        {
            /* Spark schema of the file:
             * root
             |-- addresses: array (nullable = true)
             |    |-- element: struct (containsNull = true)
             |    |    |-- line1: string (nullable = true)
             |    |    |-- name: string (nullable = true)
             |    |    |-- openingHours: array (nullable = true)
             |    |    |    |-- element: long (containsNull = true)
             |    |    |-- postcode: string (nullable = true)
             |-- cities: array (nullable = true)
             |    |-- element: string (containsNull = true)
             |-- comment: string (nullable = true)
             |-- id: long (nullable = true)
             |-- location: struct (nullable = true)
             |    |-- latitude: double (nullable = true)
             |    |-- longitude: double (nullable = true)
             |-- price: struct (nullable = true)
             |    |-- lunch: struct (nullable = true)
             |    |    |-- max: long (nullable = true)
             |    |    |-- min: long (nullable = true)
             */

            // The current expectation is that the read is rejected.
            Assert.Throws<NotSupportedException>(
                () => ParquetReader.ReadFile(GetDataFilePath("nested.parquet")));

            // Disabled assertions for a successful read:
            //
            // DataSet ds = ParquetReader.ReadFile(GetDataFilePath("nested.parquet"));
            //
            // Assert.Equal(2, ds.Count);
            // Assert.Equal(6, ds.Schema.Length);
            //
            // Assert.Equal(typeof(string), ds.Schema[0].ElementType);
            // Assert.Equal(typeof(long), ds.Schema[1].ElementType);
            // Assert.Equal(typeof(Row), ds.Schema[2].ElementType);
            // Assert.Equal(typeof(long), ds.Schema[3].ElementType);
            // Assert.Equal(typeof(Row), ds.Schema[4].ElementType);
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Reads "listofitems-empty-onerow.parquet" and checks that its first row renders
        /// as an id with an empty list and that the file has exactly one row.
        /// </summary>
        public void List_of_elements_is_empty_reads_file()
        {
            /*
             * Tricky case: the second column has no real elements. Dump of that column:
             *
             * repeats1.list.element TV=1 RL=1 DL=3
             * ----------------------------------------------------------------------------
             * page 0:  DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:13 VC:1
             *
             * BINARY repeats1.list.element
             * --------------------------------------------------------------------------------
             *** row group 1 of 1, values 1 to 1 ***
             ***value 1: R:0 D:1 V:<null>
             *
             * The dump does contain one value, but its definition level is 1 while the
             * column's maximum definition level is 3. That means the list is created on
             * level 1 (the repeats entry level).
             */

            string dataFile = GetDataFilePath("listofitems-empty-onerow.parquet");
            DataSet data = ParquetReader.ReadFile(dataFile);

            string firstRow = data[0].ToString();

            Assert.Equal("{2;[]}", firstRow);
            Assert.Equal(1, data.RowCount);
        }