public void Type_writes_and_reads_end_to_end(SchemaElement schema, object value, string name = null)
{
    var ds = new DataSet(schema)
    {
        new Row(value)
    };

    var ms = new MemoryStream();
    ParquetWriter.Write(ds, ms);

    ms.Position = 0;
    DataSet ds1 = ParquetReader.Read(ms);

    object expectedValue = ds[0][0];
    object actualValue = ds1[0][0];
    if (schema.ElementType == typeof(DateTime))
    {
        actualValue = ((DateTimeOffset)actualValue).DateTime;
    }

    Assert.True(expectedValue.Equals(actualValue),
        $"{name}| expected: {expectedValue}, actual: {actualValue}, schema element: {schema}");

    //if (schema.ElementType == typeof(decimal)) ParquetWriter.WriteFile(ds1, "c:\\tmp\\decimals.parquet");
}
public void All_compression_methods_supported(CompressionMethod compressionMethod)
{
    //v2
    var ms = new MemoryStream();
    DataSet ds1 = new DataSet(new DataField<int>("id"));
    DataSet ds2;
    ds1.Add(5);

    //write
    using (var writer = new ParquetWriter(ms))
    {
        writer.Write(ds1, compressionMethod);
    }

    //read back
    using (var reader = new ParquetReader(ms))
    {
        ms.Position = 0;
        ds2 = reader.Read();
    }

    Assert.Equal(5, ds2[0].GetInt(0));

    //v3
    //looks like writing is not working in certain scenarios!
    //broken length: 177
    //correct length: 187
    const int value = 5;
    object actual = WriteReadSingle(new DataField<int>("id"), value, compressionMethod);
    Assert.Equal(5, (int)actual);
}
public static async Task<DataSet> LoadAsync(StorageFile file, int offset = 0, int count = 100)
{
    using (IRandomAccessStreamWithContentType uwpStream = await file.OpenReadAsync())
    {
        using (Stream stream = uwpStream.AsStreamForRead())
        {
            var readerOptions = new ReaderOptions
            {
                Offset = offset,
                Count = count
            };

            var formatOptions = new ParquetOptions
            {
                TreatByteArrayAsString = true
            };

            try
            {
                return ParquetReader.Read(stream, formatOptions, readerOptions);
            }
            catch (Exception ex)
            {
                var dialog = new MessageDialog(ex.Message, "Cannot open file");
                await dialog.ShowAsync();
                return null;
            }
        }
    }
}
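// A minimal, hypothetical caller for the LoadAsync helper above (assumed to run inside an async
// UWP code-behind method); the FileOpenPicker setup and the 100-row page size are illustrative only.
var picker = new Windows.Storage.Pickers.FileOpenPicker();
picker.FileTypeFilter.Add(".parquet");

StorageFile file = await picker.PickSingleFileAsync();
if (file != null)
{
    // LoadAsync returns null when reading fails and the error dialog has already been shown.
    DataSet ds = await LoadAsync(file, offset: 0, count: 100);
}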
public void Write_and_read_nullable_integers()
{
    var ds = new DataSet(new DataField<int?>("id"))
    {
        1, 2, 3,
        (object)null,
        4,
        (object)null,
        5
    };

    var ms = new MemoryStream();
    ParquetWriter.Write(ds, ms);

    ms.Position = 0;
    DataSet ds1 = ParquetReader.Read(ms);

    Assert.Equal(1, ds1[0].GetInt(0));
    Assert.Equal(2, ds1[1].GetInt(0));
    Assert.Equal(3, ds1[2].GetInt(0));
    Assert.True(ds1[3].IsNullAt(0));
    Assert.Equal(4, ds1[4].GetInt(0));
    Assert.True(ds1[5].IsNullAt(0));
    Assert.Equal(5, ds1[6].GetInt(0));
}
public void All_compression_methods_supported(CompressionMethod compressionMethod)
{
    //v2
    var ms = new MemoryStream();
    DataSet ds1 = new DataSet(new DataField<int>("id"));
    DataSet ds2;
    ds1.Add(5);

    //write with the compression method under test
    using (var writer = new ParquetWriter(ms))
    {
        writer.Write(ds1, compressionMethod);
    }

    //read back
    using (var reader = new ParquetReader(ms))
    {
        ms.Position = 0;
        ds2 = reader.Read();
    }

    Assert.Equal(5, ds2[0].GetInt(0));

    //v3
    const int value = 5;
    object actual = WriteReadSingle(new DataField<int>("id"), value, compressionMethod);
    Assert.Equal(5, (int)actual);
}
public void Write_and_read_nullable_integers()
{
    var ds = new DataSet(new SchemaElement<int>("id"))
    {
        1, 2, 3,
        (object)null,
        4,
        (object)null,
        5
    };

    var ms = new MemoryStream();
    ParquetWriter.Write(ds, ms);

    ms.Position = 0;
    DataSet ds1 = ParquetReader.Read(ms);

    Assert.Equal(1, ds1[0].GetInt(0));
    Assert.Equal(2, ds1[1].GetInt(0));
    Assert.Equal(3, ds1[2].GetInt(0));
    Assert.True(ds1[3].IsNullAt(0));
    Assert.Equal(4, ds1[4].GetInt(0));
    Assert.True(ds1[5].IsNullAt(0));
    Assert.Equal(5, ds1[6].GetInt(0));
}
public void I_can_write_snappy_and_read_back()
{
    var ms = new MemoryStream();
    var ds1 = new DataSet(
        new DataField<int>("id"),
        new DataField<int>("no"));
    ds1.Add(1, 3);
    ds1.Add(2, 4);

    DataSet ds2;

    //write
    using (var writer = new ParquetWriter(ms))
    {
        writer.Write(ds1, CompressionMethod.Snappy);
    }

    //read back
    using (var reader = new ParquetReader(ms))
    {
        ms.Position = 0;
        ds2 = reader.Read();
    }

    Assert.Equal(1, ds2[0].GetInt(0));
    Assert.Equal(2, ds2[1].GetInt(0));
    Assert.Equal(3, ds2[0].GetInt(1));
    Assert.Equal(4, ds2[1].GetInt(1));
}
protected override bool MoveNextCore()
{
    if (_dataSetEnumerator.MoveNext())
    {
        _curDataSetRow = _dataSetEnumerator.Current;
        return true;
    }
    else if (_blockEnumerator.MoveNext())
    {
        _readerOptions.Offset = (long)_blockEnumerator.Current * _readerOptions.Count;

        // When current dataset runs out, read the next portion of the parquet file.
        DataSet ds;
        lock (_loader._parquetStream)
        {
            ds = ParquetReader.Read(_loader._parquetStream, _loader._parquetOptions, _readerOptions);
        }

        var dataSetOrder = CreateOrderSequence(ds.RowCount);
        _dataSetEnumerator = dataSetOrder.GetEnumerator();
        _curDataSetRow = dataSetOrder.ElementAt(0);

        // Cache list for each active column
        for (int i = 0; i < _actives.Length; i++)
        {
            Column col = _loader._columnsLoaded[_actives[i]];
            _columnValues[i] = ds.GetColumn(col.DataField);
        }

        return _dataSetEnumerator.MoveNext();
    }
    return false;
}
public void Reads_really_mad_nested_file()
{
    /* Spark schema:
     * root
     *  |-- addresses: array (nullable = true)
     *  |    |-- element: struct (containsNull = true)
     *  |    |    |-- line1: string (nullable = true)
     *  |    |    |-- name: string (nullable = true)
     *  |    |    |-- openingHours: array (nullable = true)
     *  |    |    |    |-- element: long (containsNull = true)
     *  |    |    |-- postcode: string (nullable = true)
     *  |-- cities: array (nullable = true)
     *  |    |-- element: string (containsNull = true)
     *  |-- comment: string (nullable = true)
     *  |-- id: long (nullable = true)
     *  |-- location: struct (nullable = true)
     *  |    |-- latitude: double (nullable = true)
     *  |    |-- longitude: double (nullable = true)
     *  |-- price: struct (nullable = true)
     *  |    |-- lunch: struct (nullable = true)
     *  |    |    |-- max: long (nullable = true)
     *  |    |    |-- min: long (nullable = true)
     */
    DataSet ds = ParquetReader.Read(OpenTestFile("nested.parquet"));

    //much easier to compare deep nesting with .ToString(), but this will break when the format changes
    Assert.Equal("{[{Dante Road;Head Office;[9;10;11;12;13;14;15;16;17;18];SE11};{Somewhere Else;Small Office;[6;7;19;20;21;22;23];TN19}];[London;Derby];this file contains all the permunations for nested structures and arrays to test Parquet parser;1;{51.2;66.3};{{2;1}}}", ds[0].ToString());
    Assert.Equal("{[{Dante Road;Head Office;[9;10;11;12;13;14;15;16;17;18];SE11};{Somewhere Else;Small Office;[6;7;19;20;21;22;23];TN19}];[London;Derby];this file contains all the permunations for nested structures and arrays to test Parquet parser;1;{51.2;66.3};{{2;1}}}", ds[1].ToString());
}
protected override bool MoveNextCore()
{
    if (_dataSetEnumerator.MoveNext())
    {
        _curDataSetRow = (int)_dataSetEnumerator.Current;
        return true;
    }
    else if (_blockEnumerator.MoveNext())
    {
        _readerOptions.Offset = (int)_blockEnumerator.Current * _readerOptions.Count;

        // When current dataset runs out, read the next portion of the parquet file.
        DataSet ds;
        lock (_loader._parquetStream)
        {
            ds = ParquetReader.Read(_loader._parquetStream, _loader._parquetOptions, _readerOptions);
        }

        int[] dataSetOrder = _rand == null
            ? Utils.GetIdentityPermutation(ds.RowCount)
            : Utils.GetRandomPermutation(_rand, ds.RowCount);
        _dataSetEnumerator = dataSetOrder.GetEnumerator();
        _curDataSetRow = dataSetOrder[0];

        // Cache list for each active column
        for (int i = 0; i < _actives.Length; i++)
        {
            Column col = _loader._columnsLoaded[_actives[i]];
            _columnValues[i] = ds.GetColumn(col.DataField);
        }

        return _dataSetEnumerator.MoveNext();
    }
    return false;
}
public static DataSet WriteRead(DataSet original, WriterOptions writerOptions = null)
{
    var ms = new MemoryStream();

    ParquetWriter.Write(original, ms, CompressionMethod.None, null, writerOptions);

    ms.Position = 0;
    return ParquetReader.Read(ms);
}
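// A small usage sketch for the WriteRead round-trip helper above; the field names and values
// are hypothetical, but the DataSet/DataField/Row API matches the other snippets in this listing.
var original = new DataSet(
    new DataField<int>("id"),
    new DataField<string>("city"));
original.Add(1, "London");
original.Add(2, "Derby");

DataSet copy = WriteRead(original);

Assert.Equal(2, copy.RowCount);
Assert.Equal("London", (string)copy[0][1]);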
private DataSet ReadParquet(string name)
{
    using (Stream s = OpenTestFile(name))
    {
        return ParquetReader.Read(s, new ParquetOptions { TreatByteArrayAsString = true });
    }
}
public void Read_all_nulls_file()
{
    DataSet ds = ParquetReader.Read(OpenTestFile("all_nulls.parquet"));

    Assert.Equal(1, ds.Schema.Length);
    Assert.Equal("lognumber", ds.Schema[0].Name);
    Assert.Equal(1, ds.RowCount);
    Assert.Null(ds[0][0]);
}
public void Read_simple_map()
{
    DataSet ds = ParquetReader.Read(OpenTestFile("map.parquet"));

    Field ms = ds.Schema[1];
    Assert.Equal("numbers", ms.Name);
    Assert.Equal("{1;[1=>one;2=>two;3=>three]}", ds[0].ToString());
}
public void FixedLenByteArray_dictionary()
{
    using (Stream s = F.OpenRead(GetDataFilePath("fixedlenbytearray.parquet")))
    {
        using (var r = new ParquetReader(s))
        {
            DataSet ds = r.Read();
        }
    }
}
public void Datetypes_all()
{
    using (Stream s = F.OpenRead(GetDataFilePath("dates.parquet")))
    {
        using (var r = new ParquetReader(s))
        {
            DataSet ds = r.Read();
        }
    }
}
public void List_of_elements_with_some_items_empty_reads_file()
{
    DataSet ds = ParquetReader.Read(OpenTestFile("listofitems-empty-alternates.parquet"));

    Assert.Equal(4, ds.RowCount);
    Assert.Equal("{1;[1;2;3]}", ds[0].ToString());
    Assert.Equal("{2;[]}", ds[1].ToString());
    Assert.Equal("{3;[1;2;3]}", ds[2].ToString());
    Assert.Equal("{4;[]}", ds[3].ToString());
}
public static DataSet WriteReadOpt(DataSet original, WriterOptions writerOptions = null)
{
    var ms = new MemoryStream();

    ParquetWriter.Write(original, ms, CompressionMethod.None, null, writerOptions);
    ms.Flush();
    //System.IO.File.WriteAllBytes("c:\\tmp\\wr.parquet", ms.ToArray());

    ms.Position = 0;
    return ParquetReader.Read(ms);
}
public void Read_only_limited_columns()
{
    var options = new ReaderOptions
    {
        Columns = new[] { "n_name", "n_regionkey" }
    };

    DataSet ds = ParquetReader.Read(OpenTestFile("nation.impala.parquet"), null, options);

    Assert.Equal(2, ds.FieldCount);
}
public void Read_all_legacy_decimals()
{
    DataSet ds = ParquetReader.Read(OpenTestFile("decimallegacy.parquet"));

    Row row = ds[0];
    Assert.Equal(1, (int)row[0]);
    Assert.Equal(1.2m, (decimal)row[1], 2);
    Assert.Null(row[2]);
    Assert.Equal(-1m, (decimal)row[3], 2);
}
public void Reads_compat_customer_impala_file()
{
    /*
     * c_name:
     * 45 pages (0-44)
     */
    DataSet customer = ParquetReader.Read(OpenTestFile("customer.impala.parquet"));

    Assert.Equal(150000, customer.RowCount);
}
protected override DataTable ReadRecordsFrom(int lowerPageBoundary, int rowsPerPage)
{
    //var records = ParquetReader.Read(fileStream, options, readOptions);
    var records = dataFileReader.Read();
    _totalRowCount = (int)records.TotalRowCount;

    var dataTable = convertToDataTable(records);
    return dataTable;
}
public void Reads_created_by_metadata()
{
    DataSet ds = DataSetGenerator.Generate(10);

    var ms = new MemoryStream();
    ParquetWriter.Write(ds, ms);

    ms.Position = 0;
    DataSet ds1 = ParquetReader.Read(ms);

    Assert.StartsWith("parquet-dotnet", ds1.Metadata.CreatedBy);
}
public void Floats()
{
    var ds = new DataSet(new SchemaElement<float>("f"));
    ds.Add((float)1.23);

    var ms = new MemoryStream();
    ParquetWriter.Write(ds, ms);

    ms.Position = 0;
    DataSet ds1 = ParquetReader.Read(ms);

    Assert.Equal(ds[0].GetFloat(0), ds1[0].GetFloat(0));
}
public void Doubles()
{
    var ds = new DataSet(new SchemaElement<double>("d"));
    ds.Add((double)12.34);

    var ms = new MemoryStream();
    ParquetWriter.Write(ds, ms);

    ms.Position = 0;
    DataSet ds1 = ParquetReader.Read(ms);

    Assert.Equal(ds[0].GetDouble(0), ds1[0].GetDouble(0));
}
public void Datetypes_all()
{
    DateTimeOffset offset, offset2;

    using (Stream s = OpenTestFile("dates.parquet"))
    {
        using (var r = new ParquetReader(s))
        {
            DataSet ds = r.Read();

            offset = (DateTimeOffset)ds[0][1];
            offset2 = (DateTimeOffset)ds[1][1];
        }
    }

    Assert.Equal(new DateTime(2017, 1, 1), offset.Date);
    Assert.Equal(new DateTime(2017, 2, 1), offset2.Date);
}
public void Read_from_negative_offset_fails()
{
    DataSet ds = DataSetGenerator.Generate(15);

    var wo = new WriterOptions { RowGroupsSize = 5 };
    var ro = new ReaderOptions { Offset = -4, Count = 2 };

    var ms = new MemoryStream();
    ParquetWriter.Write(ds, ms, CompressionMethod.None, null, wo);

    ms.Position = 0;
    Assert.Throws<ParquetException>(() => ParquetReader.Read(ms, null, ro));
}
public void Type_write_byte_and_short_byte()
{
    var schema = new Schema(
        new SchemaElement<sbyte>("sbyte"),
        new SchemaElement<byte>("byte"));

    var ds = new DataSet(schema)
    {
        { (sbyte)121, (byte)122 }
    };

    var ms = new MemoryStream();
    ParquetWriter.Write(ds, ms);

    ms.Position = 0;
    DataSet ds1 = ParquetReader.Read(ms);

    Assert.Equal(121, (sbyte)ds1[0][0]);
    Assert.Equal(122, (byte)ds1[0][1]);
}
private ParquetLoader(Arguments args, IHost host, Stream stream)
{
    Contracts.AssertValue(host, nameof(host));
    _host = host;

    _host.CheckValue(args, nameof(args));
    _host.CheckValue(stream, nameof(stream));
    _host.CheckParam(stream.CanRead, nameof(stream), "input stream must be readable");
    _host.CheckParam(stream.CanSeek, nameof(stream), "input stream must be seekable");
    _host.CheckParam(stream.Position == 0, nameof(stream), "input stream must be at head");

    using (var ch = _host.Start("Initializing host"))
    {
        _parquetStream = stream;
        _parquetOptions = new ParquetOptions
        {
            TreatByteArrayAsString = true,
            TreatBigIntegersAsDates = args.TreatBigIntegersAsDates
        };

        DataSet schemaDataSet;

        try
        {
            // We only care about the schema so ignore the rows.
            var readerOptions = new ReaderOptions
            {
                Count = 0,
                Offset = 0
            };
            schemaDataSet = ParquetReader.Read(stream, _parquetOptions, readerOptions);
            _rowCount = schemaDataSet.TotalRowCount;
        }
        catch (Exception ex)
        {
            throw new InvalidDataException("Cannot read Parquet file", ex);
        }

        _columnChunkReadSize = args.ColumnChunkReadSize;
        _columnsLoaded = InitColumns(schemaDataSet);
        Schema = CreateSchema(_host, _columnsLoaded);
    }
}
private void Read(IUnstructuredReader reader)
{
    //Parquet needs a seekable stream, so buffer the whole input into memory first
    using (var ms = new MemoryStream())
    {
        reader.BaseStream.CopyTo(ms);
        ms.Position = 0;

        _parquet = ParquetReader.Read(ms, new ParquetOptions { TreatByteArrayAsString = true });
    }

    _columnNameToIndex.Clear();
    for (int i = 0; i < _parquet.Schema.Length; i++)
    {
        _columnNameToIndex[_parquet.Schema[i].Name] = i;
    }
}