private static void Debug()
{
   //GCSettings.LatencyMode = GCLatencyMode.LowLatency;

   DataSet ds = ParquetReader.ReadFile(
      "c:\\tmp\\customer.impala.parquet",
      new ParquetOptions { TreatByteArrayAsString = true });
}
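For contrast, a minimal sketch of the same read with default options (the method name is hypothetical; the file path is the one used above). Without TreatByteArrayAsString, string-like BYTE_ARRAY columns are presumably surfaced as raw byte arrays rather than decoded strings:

private static void DebugDefaultOptions()
{
   // Hypothetical comparison: same file, default ParquetOptions.
   // Without TreatByteArrayAsString, BYTE_ARRAY columns presumably
   // come back as raw byte[] values instead of strings.
   DataSet ds = ParquetReader.ReadFile("c:\\tmp\\customer.impala.parquet");
   Console.WriteLine("rows: {0}, columns: {1}", ds.RowCount, ds.ColumnCount);
}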
public void Read_simple_nested_field()
{
   /*
    * root
    *  |-- city: struct (nullable = true)
    *  |    |-- country: string (nullable = true)
    *  |    |-- isCapital: boolean (nullable = true)
    *  |    |-- name: string (nullable = true)
    *  |-- id: long (nullable = true)
    */

   DataSet ds = ParquetReader.ReadFile(GetDataFilePath("simplenested.parquet"));

   Assert.Equal(1, ds.RowCount);
   Assert.Equal(2, ds.ColumnCount);
   Assert.Equal(typeof(Row), ds.Schema[0].ElementType);
   Assert.Equal(typeof(long), ds.Schema[1].ElementType);
   Assert.Equal("city", ds.Schema.ColumnNames[0]);
   Assert.Equal("id", ds.Schema.ColumnNames[1]);

   Row mr = ds[0];

   Row city = mr.Get<Row>(0);
   Assert.Equal("United Kingdom", city[0]);
   Assert.Equal(true, city[1]);
   Assert.Equal("London", city[2]);

   Assert.Equal(1L, mr[1]);
}
private static void Debug()
{
   //GCSettings.LatencyMode = GCLatencyMode.LowLatency;

   DataSet ds = ParquetReader.ReadFile(
      @"C:\dev\parquet-dotnet\src\Parquet.Test\data\customer.impala.parquet",
      new ParquetOptions { TreatByteArrayAsString = true });
}
public static DataSet ReadFromParquetFile(string path, out long fileLen)
{
   var fileInfo = new System.IO.FileInfo(path);
   fileLen = fileInfo.Length;

   return ParquetReader.ReadFile(path);
}
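A possible call site for the helper above, kept as a hedged sketch: the wrapping method is hypothetical, and the path reuses one that appears elsewhere in these snippets. It shows the out parameter reporting the on-disk size alongside the parsed DataSet:

public static void PrintFileStats()
{
   // Hypothetical usage of ReadFromParquetFile.
   long fileLen;
   DataSet ds = ReadFromParquetFile("c:\\tmp\\customer.impala.parquet", out fileLen);
   Console.WriteLine("{0} rows read from a {1} byte file", ds.RowCount, fileLen);
}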
public void Reads_repeated_struct()
{
   DataSet ds = ParquetReader.ReadFile(GetDataFilePath("repeatedstruct.parquet"));

   Assert.Equal("{[{UK;London};{US;New York}];1}", ds[0].ToString());

   //DataSet ds1 = ParquetReader.ReadFile("c:\\tmp\\rep.parquet");
}
public void List_of_elements_with_some_items_empty_reads_file()
{
   DataSet ds = ParquetReader.ReadFile(GetDataFilePath("listofitems-empty-alternates.parquet"));

   Assert.Equal(4, ds.RowCount);
   Assert.Equal("{1;[1;2;3]}", ds[0].ToString());
   Assert.Equal("{2;[]}", ds[1].ToString());
   Assert.Equal("{3;[1;2;3]}", ds[2].ToString());
   Assert.Equal("{4;[]}", ds[3].ToString());
}
private static void Perf()
{
   var readTimes = new List<TimeSpan>();
   var writeUncompressedTimes = new List<TimeSpan>();
   var writeGzipTimes = new List<TimeSpan>();
   var writeSnappyTimes = new List<TimeSpan>();

   for (int i = 0; i < 4; i++)
   {
      DataSet ds;

      using (var time = new TimeMeasure())
      {
         ds = ParquetReader.ReadFile("C:\\tmp\\customer.impala.parquet");

         TimeSpan elapsed = time.Elapsed;
         readTimes.Add(elapsed);
         log.Trace("read in {0}", elapsed);
      }

      /*string dest = "c:\\tmp\\write.test.parquet";
      if (F.Exists(dest)) F.Delete(dest);

      using (var time = new TimeMeasure())
      {
         ParquetWriter.WriteFile(ds, dest, CompressionMethod.None);
         writeUncompressedTimes.Add(time.Elapsed);
      }

      using (var time = new TimeMeasure())
      {
         ParquetWriter.WriteFile(ds, dest, CompressionMethod.Gzip);
         writeGzipTimes.Add(time.Elapsed);
      }

      using (var time = new TimeMeasure())
      {
         ParquetWriter.WriteFile(ds, dest, CompressionMethod.Snappy);
         writeSnappyTimes.Add(time.Elapsed);
      }*/

      log.Trace("run finished: {0}", i);
   }

   double avgRead = readTimes.Skip(1).Average(t => t.TotalMilliseconds);
   log.Trace("avg: {0}", avgRead);

   /*double avgUncompressed = writeUncompressedTimes.Skip(1).Average(t => t.TotalMilliseconds);
   double avgGzip = writeGzipTimes.Skip(1).Average(t => t.TotalMilliseconds);
   double avgSnappy = writeSnappyTimes.Skip(1).Average(t => t.TotalMilliseconds);

   log.Trace("averages => read: {0}, uncompressed: {1}, gzip: {2}, snappy: {3}", avgRead, avgUncompressed, avgGzip, avgSnappy);*/
}
public void Read_simple_map()
{
   DataSet ds = ParquetReader.ReadFile(GetDataFilePath("map.parquet"));

   SchemaElement ms = ds.Schema[1];
   Assert.Equal("numbers", ms.Name);
   Assert.Equal(1, ms.Extra[0].MaxRepetitionLevel);
   Assert.Equal(2, ms.Extra[0].MaxDefinitionLevel);
   Assert.Equal(1, ms.Extra[1].MaxRepetitionLevel);
   Assert.Equal(3, ms.Extra[1].MaxDefinitionLevel);

   Assert.Equal("{1;[1=>one;2=>two;3=>three]}", ds[0].ToString());

   //DataSet ds2 = DataSetGenerator.WriteRead(ds);
   //ParquetWriter.WriteFile(ds, "c:\\tmp\\pmaps.parquet", CompressionMethod.None);
}
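As a rough companion to the level assertions above, a sketch (hypothetical method name, using only members already exercised in this test) that prints the repetition/definition levels of the map's key and value elements instead of asserting them:

private void DumpMapLevels()
{
   // Hypothetical helper: dump the levels asserted in Read_simple_map.
   DataSet ds = ParquetReader.ReadFile(GetDataFilePath("map.parquet"));
   SchemaElement ms = ds.Schema[1];

   Console.WriteLine("column: {0}", ms.Name);
   Console.WriteLine("key   RL={0} DL={1}", ms.Extra[0].MaxRepetitionLevel, ms.Extra[0].MaxDefinitionLevel);
   Console.WriteLine("value RL={0} DL={1}", ms.Extra[1].MaxRepetitionLevel, ms.Extra[1].MaxDefinitionLevel);
}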
static void Main(string[] args)
{
   L.Config
      .WriteTo.PoshConsole();

   DataSet ds;
   using (var time = new TimeMeasure())
   {
      ds = ParquetReader.ReadFile("C:\\tmp\\postcodes.plain.parquet");
      Console.WriteLine("read in {0}", time.Elapsed);
   }

   Console.WriteLine("has {0} rows", ds.RowCount);

   //postcodes.plain.parquet - 137Mb
   //debug: 26 seconds.
   //release: 25 seconds.
}
public void Reads_really_mad_nested_file()
{
   /* Spark schema:
    * root
    *  |-- addresses: array (nullable = true)
    *  |    |-- element: struct (containsNull = true)
    *  |    |    |-- line1: string (nullable = true)
    *  |    |    |-- name: string (nullable = true)
    *  |    |    |-- openingHours: array (nullable = true)
    *  |    |    |    |-- element: long (containsNull = true)
    *  |    |    |-- postcode: string (nullable = true)
    *  |-- cities: array (nullable = true)
    *  |    |-- element: string (containsNull = true)
    *  |-- comment: string (nullable = true)
    *  |-- id: long (nullable = true)
    *  |-- location: struct (nullable = true)
    *  |    |-- latitude: double (nullable = true)
    *  |    |-- longitude: double (nullable = true)
    *  |-- price: struct (nullable = true)
    *  |    |-- lunch: struct (nullable = true)
    *  |    |    |-- max: long (nullable = true)
    *  |    |    |-- min: long (nullable = true)
    */

   Assert.Throws<NotSupportedException>(() => ParquetReader.ReadFile(GetDataFilePath("nested.parquet")));

   //DataSet ds = ParquetReader.ReadFile(GetDataFilePath("nested.parquet"));
   //Assert.Equal(2, ds.Count);
   //Assert.Equal(6, ds.Schema.Length);

   /*Assert.Equal(typeof(string), ds.Schema[0].ElementType);
   Assert.Equal(typeof(long), ds.Schema[1].ElementType);
   Assert.Equal(typeof(Row), ds.Schema[2].ElementType);
   Assert.Equal(typeof(long), ds.Schema[3].ElementType);
   Assert.Equal(typeof(Row), ds.Schema[4].ElementType);*/
}
public void List_of_elements_is_empty_reads_file()
{
   /*
    * This is a tricky one, as there are actually no elements in the second column. Here is a dump of it:
    *
    * repeats1.list.element TV=1 RL=1 DL=3
    * ----------------------------------------------------------------------------
    * page 0: DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:13 VC:1
    *
    * BINARY repeats1.list.element
    * --------------------------------------------------------------------------------
    * *** row group 1 of 1, values 1 to 1 ***
    * value 1: R:0 D:1 V:<null>
    *
    * The dump shows there is actually one value, but with DL=1, whereas the column's DL is 3. That means the list
    * is created on level 1 (the repeats entry level).
    */

   DataSet ds = ParquetReader.ReadFile(GetDataFilePath("listofitems-empty-onerow.parquet"));

   Assert.Equal("{2;[]}", ds[0].ToString());
   Assert.Equal(1, ds.RowCount);
}