private void ReadDataBackgroundWorker_DoWork(object sender, DoWorkEventArgs e)
{
    //Parquet.NET doesn't have any async methods or readers that allow sequential record reading, so we use the ThreadPool to support cancellation.
    var task = Task<ParquetReadResult>.Run(() =>
    {
        //Unfortunately there's no way to quickly get the metadata from a parquet file without reading an actual data row
        //BUG: Parquet.NET doesn't always respect the Count parameter, sometimes returning more than the passed value...
        using (var parquetReader = ParquetReader.OpenFromFile(this.OpenFilePath, new ParquetOptions() { TreatByteArrayAsString = true }))
        {
            int totalRowCount = 0;
            DataTable result = UtilityMethods.ParquetReaderToDataTable(parquetReader, out totalRowCount, this.SelectedFields, this.CurrentOffset, this.CurrentMaxRowCount);
            return new ParquetReadResult(result, totalRowCount);
        }
    });

    while (!task.IsCompleted && !((BackgroundWorker)sender).CancellationPending)
    {
        task.Wait(1000);
    }

    e.Cancel = ((BackgroundWorker)sender).CancellationPending;

    if (task.IsCompleted)
    {
        e.Result = task.Result;
    }
}
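// The handler above returns its rows through a small result container. The
// ParquetReadResult type is not part of the snippet; a minimal sketch of such a
// DTO, assuming it only needs to carry the DataTable and the total row count
// (property names are illustrative), could look like this:
public class ParquetReadResult
{
    public DataTable Result { get; }
    public int TotalNumberOfRecordsInFile { get; }

    public ParquetReadResult(DataTable result, int totalNumberOfRecordsInFile)
    {
        Result = result;
        TotalNumberOfRecordsInFile = totalNumberOfRecordsInFile;
    }
}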
public static void TestAgainstThirdParty()
{
    var columns = new Column[] { new Column<decimal>("Decimal", LogicalType.Decimal(precision: 29, scale: 3)) };
    var values = Enumerable.Range(0, 10_000)
        .Select(i => ((decimal)i * i * i) / 1000 - 10)
        .Concat(new[] { decimal.MinValue / 1000, decimal.MaxValue / 1000 })
        .ToArray();

    using var buffer = new ResizableBuffer();

    // Write using ParquetSharp
    using (var outStream = new BufferOutputStream(buffer))
    {
        using var fileWriter = new ParquetFileWriter(outStream, columns, Compression.Snappy);
        using var rowGroupWriter = fileWriter.AppendRowGroup();
        using var columnWriter = rowGroupWriter.NextColumn().LogicalWriter<decimal>();

        columnWriter.WriteBatch(values);

        fileWriter.Close();
    }

    // Read using Parquet.NET
    using var memoryStream = new MemoryStream(buffer.ToArray());
    using var fileReader = new ParquetReader(memoryStream);
    using var rowGroupReader = fileReader.OpenRowGroupReader(0);

    var read = (decimal[])rowGroupReader.ReadColumn(fileReader.Schema.GetDataFields()[0]).Data;
    Assert.AreEqual(values, read);
}
public void FileMetadata_sets_num_rows_on_file_and_row_group_multiple_row_groups()
{
    var ms = new MemoryStream();
    var id = new DataField<int>("id");

    //write
    using (var writer = new ParquetWriter(new Schema(id), ms))
    {
        using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
        {
            rg.WriteColumn(new DataColumn(id, new[] { 1, 2, 3, 4 }));
        }

        using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
        {
            rg.WriteColumn(new DataColumn(id, new[] { 5, 6 }));
        }
    }

    //read back
    using (var reader = new ParquetReader(ms))
    {
        Assert.Equal(6, reader.ThriftMetadata.Num_rows);

        using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(0))
        {
            Assert.Equal(4, rg.RowCount);
        }

        using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(1))
        {
            Assert.Equal(2, rg.RowCount);
        }
    }
}
public override IEnumerable<object> AsEnumerable(object source, Func<object, bool?> filterFunc = null)
{
    if (source == null)
    {
        yield break;
    }

    ParquetReader sr = source as ParquetReader;
    ChoGuard.ArgumentNotNull(sr, "ParquetReader");

    InitializeRecordConfiguration(Configuration);

    if (!RaiseBeginLoad(sr))
    {
        yield break;
    }

    if (InterceptRowGroup)
    {
        foreach (var item in AsEnumerable(ReadObjectsByRowGroup(sr).SelectMany(i => i.Select(i1 => i1)), TraceSwitch, filterFunc))
        {
            yield return item;
        }
    }
    else
    {
        foreach (var item in AsEnumerable(ReadAllObjects(sr), TraceSwitch, filterFunc))
        {
            yield return item;
        }
    }

    RaiseEndLoad(sr);
}
public void List_of_elements_with_some_items_empty_reads_file()
{
    /*
     * list data:
     * - 1: [1, 2, 3]
     * - 2: []
     * - 3: [1, 2, 3]
     * - 4: []
     */
    using (var reader = new ParquetReader(OpenTestFile("listofitems-empty-alternates.parquet")))
    {
        using (ParquetRowGroupReader groupReader = reader.OpenRowGroupReader(0))
        {
            Assert.Equal(4, groupReader.RowCount);
            DataField[] fs = reader.Schema.GetDataFields();

            DataColumn id = groupReader.ReadColumn(fs[0]);
            Assert.Equal(4, id.Data.Length);
            Assert.False(id.HasRepetitions);

            DataColumn list = groupReader.ReadColumn(fs[1]);
            Assert.Equal(8, list.Data.Length);
            Assert.Equal(new int[] { 0, 1, 1, 0, 0, 1, 1, 0 }, list.RepetitionLevels);
        }
    }
}
public void Special_read_file_with_multiple_row_groups()
{
    var ms = new MemoryStream();

    //create multirowgroup file

    //first row group
    var t = new Table(new DataField<int>("id"));
    t.Add(1);
    t.Add(2);
    using (var writer = new ParquetWriter(t.Schema, ms))
    {
        writer.Write(t);
    }

    //second row group
    t.Clear();
    t.Add(3);
    t.Add(4);
    using (var writer = new ParquetWriter(t.Schema, ms, null, true))
    {
        writer.Write(t);
    }

    //read back as table
    t = ParquetReader.ReadTableFromStream(ms);
    Assert.Equal(4, t.Count);
}
public void Read_simple_repeated_field()
{
    /*
     * root
     *  |-- cities: array (nullable = true)
     *  |    |-- element: string (containsNull = true)
     *  |-- id: long (nullable = true)
     */
    DataSet ds = ParquetReader.ReadFile(GetDataFilePath("simplerepeated.parquet"));

    Assert.Equal(2, ds.Schema.Length);

    Assert.Equal(typeof(IEnumerable<string>), ds.Schema[0].ColumnType);
    Assert.Equal(typeof(string), ds.Schema[0].ElementType);
    Assert.Equal(typeof(long), ds.Schema[1].ElementType);

    Assert.Equal("cities", ds.Schema[0].Name);
    Assert.Equal("id", ds.Schema[1].Name);

    Assert.True(ds.Schema[0].IsRepeated);
    Assert.False(ds.Schema[1].IsRepeated);

    Assert.Equal(1L, ds[0][1]);
    Assert.Equal(ds[0][0], new[] { "London", "Derby", "Paris", "New York" });
}
public override IEnumerable<object> AsEnumerable(object source, Func<object, bool?> filterFunc = null)
{
    if (source == null)
    {
        yield break;
    }

    ParquetReader sr = source as ParquetReader;
    ChoGuard.ArgumentNotNull(sr, "ParquetReader");

    InitializeRecordConfiguration(Configuration);

    if (!RaiseBeginLoad(sr))
    {
        yield break;
    }

    foreach (var item in AsEnumerable(ReadObjects(sr), TraceSwitch, filterFunc))
    {
        yield return item;
    }

    RaiseEndLoad(sr);
}
public IEnumerator<T> GetEnumerator()
{
    CheckDisposed();

    ChoParquetRecordReader rr = new ChoParquetRecordReader(typeof(T), Configuration);
    if (_streamReader != null)
    {
        _parquetReader = Create(_streamReader.Value);
    }

    // wire the record reader back to this reader and forward its events
    rr.Reader = this;
    rr.TraceSwitch = TraceSwitch;
    rr.RowsLoaded += NotifyRowsLoaded;
    rr.BeforeRowGroupLoad += BeforeRowGroupLoad;
    rr.AfterRowGroupLoaded += AfterRowGroupLoaded;
    rr.MembersDiscovered += MembersDiscovered;
    rr.RecordFieldTypeAssessment += RecordFieldTypeAssessment;

    // read row group by row group only when row-group lifecycle handlers are attached
    var beforeRowGroup = BeforeRowGroupLoad;
    var afterRowGroup = AfterRowGroupLoaded;
    if (beforeRowGroup != null || afterRowGroup != null)
    {
        rr.InterceptRowGroup = true;
    }

    var e = rr.AsEnumerable(_parquetReader).GetEnumerator();
    return ChoEnumeratorWrapper.BuildEnumerable<T>(
        () => { ++_recordNumber; return e.MoveNext(); },
        () => (T)ChoConvert.ChangeType<ChoRecordFieldAttribute>(e.Current, typeof(T)),
        () => Dispose()).GetEnumerator();
}
private void CompareWithMr(Table t)
{
    string testFileName = Path.GetFullPath("temp.parquet");
    if (F.Exists(testFileName))
    {
        F.Delete(testFileName);
    }

    //produce file
    using (Stream s = F.OpenWrite(testFileName))
    {
        using (var writer = new ParquetWriter(t.Schema, s))
        {
            writer.Write(t);
        }
    }

    //read back
    Table t2 = ParquetReader.ReadTableFromFile(testFileName);

    //check we don't have a bug internally before launching MR
    Assert.Equal(t.ToString("j"), t2.ToString("j"), ignoreLineEndingDifferences: true);

    string mrJson = ExecAndGetOutput(_javaExecName, $"-jar {_toolsJarPath} cat -j {testFileName}");
    Assert.Equal(t.ToString("j"), mrJson);
}
public void Read_multiple_data_pages()
{
    using (var reader = new ParquetReader(OpenTestFile("/special/multi_data_page.parquet"), leaveStreamOpen: false))
    {
        DataColumn[] columns = reader.ReadEntireRowGroup();

        string[] s = (string[])columns[0].Data;
        double?[] d = (double?[])columns[1].Data;

        // check for nulls (issue #370)
        for (int i = 0; i < s.Length; i++)
        {
            Assert.True(s[i] != null, "found null in s at " + i);
            Assert.True(d[i] != null, "found null in d at " + i);
        }

        // run aggregations checking row alignment (issue #371)
        var seq = s.Zip(d.Cast<double>(), (w, v) => new { w, v })
            .Where(p => p.w == "general")
            .ToList();

        // double matching is fuzzy, but matching strings is enough for this test
        Assert.Equal("0.754359925788497", seq.Min(p => p.v).ToString(CultureInfo.InvariantCulture));
        Assert.Equal("0.85776", seq.Max(p => p.v).ToString(CultureInfo.InvariantCulture));
    }
}
public void Write_and_read_nullable_integers()
{
    var ds = new DataSet(new DataField<int?>("id"))
    {
        1, 2, 3,
        (object)null,
        4,
        (object)null,
        5
    };
    var ms = new MemoryStream();
    ParquetWriter.Write(ds, ms);

    ms.Position = 0;
    DataSet ds1 = ParquetReader.Read(ms);

    Assert.Equal(1, ds1[0].GetInt(0));
    Assert.Equal(2, ds1[1].GetInt(0));
    Assert.Equal(3, ds1[2].GetInt(0));
    Assert.True(ds1[3].IsNullAt(0));
    Assert.Equal(4, ds1[4].GetInt(0));
    Assert.True(ds1[5].IsNullAt(0));
    Assert.Equal(5, ds1[6].GetInt(0));
}
public void Reads_really_mad_nested_file()
{
    /* Spark schema:
     * root
     *  |-- addresses: array (nullable = true)
     *  |    |-- element: struct (containsNull = true)
     *  |    |    |-- line1: string (nullable = true)
     *  |    |    |-- name: string (nullable = true)
     *  |    |    |-- openingHours: array (nullable = true)
     *  |    |    |    |-- element: long (containsNull = true)
     *  |    |    |-- postcode: string (nullable = true)
     *  |-- cities: array (nullable = true)
     *  |    |-- element: string (containsNull = true)
     *  |-- comment: string (nullable = true)
     *  |-- id: long (nullable = true)
     *  |-- location: struct (nullable = true)
     *  |    |-- latitude: double (nullable = true)
     *  |    |-- longitude: double (nullable = true)
     *  |-- price: struct (nullable = true)
     *  |    |-- lunch: struct (nullable = true)
     *  |    |    |-- max: long (nullable = true)
     *  |    |    |-- min: long (nullable = true)
     */
    DataSet ds = ParquetReader.Read(OpenTestFile("nested.parquet"));

    //much easier to compare mad nestness with .ToString(), but will break when it changes
    Assert.Equal("{[{Dante Road;Head Office;[9;10;11;12;13;14;15;16;17;18];SE11};{Somewhere Else;Small Office;[6;7;19;20;21;22;23];TN19}];[London;Derby];this file contains all the permunations for nested structures and arrays to test Parquet parser;1;{51.2;66.3};{{2;1}}}", ds[0].ToString());
    Assert.Equal("{[{Dante Road;Head Office;[9;10;11;12;13;14;15;16;17;18];SE11};{Somewhere Else;Small Office;[6;7;19;20;21;22;23];TN19}];[London;Derby];this file contains all the permunations for nested structures and arrays to test Parquet parser;1;{51.2;66.3};{{2;1}}}", ds[1].ToString());
}
public void Type_writes_and_reads_end_to_end(SchemaElement schema, object value, string name = null)
{
    var ds = new DataSet(schema)
    {
        new Row(value)
    };

    var ms = new MemoryStream();
    ParquetWriter.Write(ds, ms);

    ms.Position = 0;
    DataSet ds1 = ParquetReader.Read(ms);

    object expectedValue = ds[0][0];
    object actualValue = ds1[0][0];

    if (schema.ElementType == typeof(DateTime))
    {
        actualValue = ((DateTimeOffset)actualValue).DateTime;
    }

    Assert.True(expectedValue.Equals(actualValue), $"{name}| expected: {expectedValue}, actual: {actualValue}, schema element: {schema}");

    //if (schema.ElementType == typeof(decimal)) ParquetWriter.WriteFile(ds1, "c:\\tmp\\decimals.parquet");
}
public void List_simple_element_write_read()
{
    var table = new Table(
        new Schema(
            new DataField<int>("id"),
            new ListField("cities", new DataField<string>("name"))));

    var ms = new MemoryStream();

    table.Add(1, new[] { "London", "Derby" });
    table.Add(2, new[] { "Paris", "New York" });

    //write as table
    using (var writer = new ParquetWriter(table.Schema, ms))
    {
        writer.Write(table);
    }

    //read back into table
    ms.Position = 0;
    Table table2;
    using (var reader = new ParquetReader(ms))
    {
        table2 = reader.ReadAsTable();
    }

    //validate data
    Assert.Equal(table.ToString(), table2.ToString(), ignoreLineEndingDifferences: true);
}
public static async Task<DataSet> LoadAsync(StorageFile file, int offset = 0, int count = 100)
{
    using (IRandomAccessStreamWithContentType uwpStream = await file.OpenReadAsync())
    {
        using (Stream stream = uwpStream.AsStreamForRead())
        {
            var readerOptions = new ReaderOptions()
            {
                Offset = offset,
                Count = count
            };

            var formatOptions = new ParquetOptions
            {
                TreatByteArrayAsString = true
            };

            try
            {
                return ParquetReader.Read(stream, formatOptions, readerOptions);
            }
            catch (Exception ex)
            {
                var dialog = new MessageDialog(ex.Message, "Cannot open file");
                await dialog.ShowAsync();
                return null;
            }
        }
    }
}
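// A typical caller for the UWP loader above would obtain the StorageFile from a
// file picker. This is a minimal sketch only; it assumes LoadAsync lives in a
// class named ParquetLoader and is invoked from a XAML event handler (neither
// is shown in the snippet).
private async void OpenFileButton_Click(object sender, RoutedEventArgs e)
{
    var picker = new FileOpenPicker();
    picker.FileTypeFilter.Add(".parquet");

    StorageFile file = await picker.PickSingleFileAsync();
    if (file != null)
    {
        // read the first 100 rows of the selected file
        DataSet ds = await ParquetLoader.LoadAsync(file, offset: 0, count: 100);
    }
}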
public void Flat_write_read()
{
    var table = new Table(new Schema(new DataField<int>("id"), new DataField<string>("city")));
    var ms = new MemoryStream();

    //generate fake data
    for (int i = 0; i < 1000; i++)
    {
        table.Add(new Row(i, "record#" + i));
    }

    //write to stream
    using (var writer = new ParquetWriter(table.Schema, ms))
    {
        writer.Write(table);
    }

    //read back into table
    ms.Position = 0;
    Table table2;
    using (var reader = new ParquetReader(ms))
    {
        table2 = reader.ReadAsTable();
    }

    //validate data
    Assert.True(table.Equals(table2, true));
}
public void Write_and_read_nullable_integers()
{
    var ds = new DataSet(new SchemaElement<int>("id"))
    {
        1, 2, 3,
        (object)null,
        4,
        (object)null,
        5
    };
    var ms = new MemoryStream();
    ParquetWriter.Write(ds, ms);

    ms.Position = 0;
    DataSet ds1 = ParquetReader.Read(ms);

    Assert.Equal(1, ds1[0].GetInt(0));
    Assert.Equal(2, ds1[1].GetInt(0));
    Assert.Equal(3, ds1[2].GetInt(0));
    Assert.True(ds1[3].IsNullAt(0));
    Assert.Equal(4, ds1[4].GetInt(0));
    Assert.True(ds1[5].IsNullAt(0));
    Assert.Equal(5, ds1[6].GetInt(0));
}
public void Array_write_read()
{
    var table = new Table(
        new Schema(
            new DataField<int>("id"),
            new DataField<string[]>("categories") //array field
        )
    );
    var ms = new MemoryStream();

    table.Add(1, new[] { "1", "2", "3" });
    table.Add(3, new[] { "3", "3", "3" });

    //write to stream
    using (var writer = new ParquetWriter(table.Schema, ms))
    {
        writer.Write(table);
    }

    //System.IO.File.WriteAllBytes("c:\\tmp\\1.parquet", ms.ToArray());

    //read back into table
    ms.Position = 0;
    Table table2;
    using (var reader = new ParquetReader(ms))
    {
        table2 = reader.ReadAsTable();
    }

    //validate data
    Assert.Equal(table.ToString(), table2.ToString(), ignoreLineEndingDifferences: true);
}
public ParquetDataReader(ParquetReader parquetReader, IEnumerable<DataField> fields, IEnumerable<ColumnConverter> columnConverters)
{
    _parquetReader = parquetReader ?? throw new ArgumentNullException(nameof(parquetReader));
    _fields = fields?.ToList() ?? throw new ArgumentNullException(nameof(fields));
    _rowGroupCount = parquetReader.RowGroupCount;

    // build the per-column converter delegates lazily (thread-safe) on first use
    _columnConverters = new Lazy<List<Func<object, object>>>(() => InitializeColumnConverters(columnConverters), true);
}
public void Read_simple_nested_field()
{
    /*
     * root
     *  |-- city: struct (nullable = true)
     *  |    |-- country: string (nullable = true)
     *  |    |-- isCapital: boolean (nullable = true)
     *  |    |-- name: string (nullable = true)
     *  |-- id: long (nullable = true)
     */

    //Assert.Throws<NotSupportedException>(() => ParquetReader.ReadFile(GetDataFilePath("simplenested.parquet")));
    //return;

    DataSet ds = ParquetReader.ReadFile(GetDataFilePath("simplenested.parquet"));

    Assert.Equal(1, ds.RowCount);
    Assert.Equal(2, ds.ColumnCount);

    Assert.Equal(typeof(Row), ds.Schema[0].ElementType);
    Assert.Equal(typeof(long), ds.Schema[1].ElementType);

    Assert.Equal("city", ds.Schema.ColumnNames[0]);
    Assert.Equal("id", ds.Schema.ColumnNames[1]);

    Row mr = ds[0];

    Row city = mr.Get<Row>(0);
    Assert.Equal("United Kingdom", city[0]);
    Assert.True((bool)city[1]);
    Assert.Equal("London", city[2]);

    Assert.Equal(1L, mr[1]);
}
public static Dictionary<int, string> ReadParquetFile(string infile)
{
    Dictionary<int, string> serializedRequests = new Dictionary<int, string>();
    string path = Path.GetFullPath(Directory.GetCurrentDirectory() + "/" + infile);

    using (Stream fileStream = File.OpenRead(path))
    {
        using (var parquetReader = new ParquetReader(fileStream))
        {
            DataField[] dataFields = parquetReader.Schema.GetDataFields();

            // only the first row group is read; the file is expected to contain
            // an id column followed by a serialized request column
            using (ParquetRowGroupReader groupReader = parquetReader.OpenRowGroupReader(0))
            {
                DataColumn[] columns = dataFields.Select(groupReader.ReadColumn).ToArray();
                DataColumn firstColumn = columns[0];
                DataColumn secondColumn = columns[1];

                Array idData = firstColumn.Data;
                Array requestData = secondColumn.Data;

                for (var j = 0; j < firstColumn.Data.Length; j++)
                {
                    var convertedRequestData = (string)requestData.GetValue(j);
                    var convertedIdData = (int)idData.GetValue(j);
                    serializedRequests.Add(convertedIdData, convertedRequestData);
                }
            }

            return serializedRequests;
        }
    }
}
public void Read_multiple_data_pages()
{
    using (var reader = new ParquetReader(OpenTestFile("/special/multi_data_page.parquet"), leaveStreamOpen: false))
    {
        DataColumn[] columns = reader.ReadEntireRowGroup();

        string[] s = (string[])columns[0].Data;
        double?[] d = (double?[])columns[1].Data;

        // check for nulls (issue #370)
        for (int i = 0; i < s.Length; i++)
        {
            Assert.True(s[i] != null, "found null in s at " + i);
            Assert.True(d[i] != null, "found null in d at " + i);
        }

        // run aggregations checking row alignment (issue #371)
        var seq = s.Zip(d.Cast<double>(), (w, v) => new { w, v })
            .Where(p => p.w == "favorable")
            .ToList();

        // double matching is fuzzy, but matching strings is enough for this test
        // ground truth was computed using Spark
        Assert.Equal(26706.6185312147, seq.Sum(p => p.v), 5);
        Assert.Equal(0.808287234987281, seq.Average(p => p.v), 5);
        Assert.Equal(0.71523915461624, seq.Min(p => p.v), 5);
        Assert.Equal(0.867111980015206, seq.Max(p => p.v), 5);
    }
}
public void All_compression_methods_supported(CompressionMethod compressionMethod)
{
    //v2
    var ms = new MemoryStream();
    DataSet ds1 = new DataSet(new DataField<int>("id"));
    DataSet ds2;
    ds1.Add(5);

    //write
    using (var writer = new ParquetWriter(ms))
    {
        writer.Write(ds1, CompressionMethod.Gzip);
    }

    //read back
    using (var reader = new ParquetReader(ms))
    {
        ms.Position = 0;
        ds2 = reader.Read();
    }

    Assert.Equal(5, ds2[0].GetInt(0));

    //v3
    const int value = 5;
    object actual = WriteReadSingle(new DataField<int>("id"), value, compressionMethod);
    Assert.Equal(5, (int)actual);
}
protected override bool MoveNextCore()
{
    if (_dataSetEnumerator.MoveNext())
    {
        _curDataSetRow = _dataSetEnumerator.Current;
        return true;
    }
    else if (_blockEnumerator.MoveNext())
    {
        _readerOptions.Offset = (long)_blockEnumerator.Current * _readerOptions.Count;

        // When current dataset runs out, read the next portion of the parquet file.
        DataSet ds;
        lock (_loader._parquetStream)
        {
            ds = ParquetReader.Read(_loader._parquetStream, _loader._parquetOptions, _readerOptions);
        }

        var dataSetOrder = CreateOrderSequence(ds.RowCount);
        _dataSetEnumerator = dataSetOrder.GetEnumerator();
        _curDataSetRow = dataSetOrder.ElementAt(0);

        // Cache list for each active column
        for (int i = 0; i < _actives.Length; i++)
        {
            Column col = _loader._columnsLoaded[_actives[i]];
            _columnValues[i] = ds.GetColumn(col.DataField);
        }

        return _dataSetEnumerator.MoveNext();
    }

    return false;
}
public void ReadIntro()
{
    // open file stream
    using (Stream fileStream = System.IO.File.OpenRead("c:\\test.parquet"))
    {
        // open parquet file reader
        using (var parquetReader = new ParquetReader(fileStream))
        {
            // get file schema (available straight after opening parquet reader)
            // however, get only data fields as only they contain data values
            DataField[] dataFields = parquetReader.Schema.GetDataFields();

            // enumerate through row groups in this file
            for (int i = 0; i < parquetReader.RowGroupCount; i++)
            {
                // create row group reader
                using (ParquetRowGroupReader groupReader = parquetReader.OpenRowGroupReader(i))
                {
                    // read all columns inside each row group (you have an option to read only
                    // required columns if you need to)
                    DataColumn[] columns = dataFields.Select(groupReader.ReadColumn).ToArray();

                    // get first column, for instance
                    DataColumn firstColumn = columns[0];

                    // .Data member contains a typed array of column data you can cast to the type of the column
                    Array data = firstColumn.Data;
                    int[] ids = (int[])data;
                }
            }
        }
    }
}
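// A minimal write-side counterpart to ReadIntro above, sketched against the same
// Parquet.NET 3.x API that the row-group tests in this collection use
// (ParquetWriter, CreateRowGroup, WriteColumn); the output path and the single
// "id" field are illustrative only.
public void WriteIntro()
{
    var idField = new DataField<int>("id");
    var schema = new Schema(idField);

    using (Stream fileStream = System.IO.File.Create("c:\\test.parquet"))
    {
        using (var parquetWriter = new ParquetWriter(schema, fileStream))
        {
            // each CreateRowGroup() call starts a new row group in the file
            using (ParquetRowGroupWriter groupWriter = parquetWriter.CreateRowGroup())
            {
                groupWriter.WriteColumn(new DataColumn(idField, new[] { 1, 2, 3, 4 }));
            }
        }
    }
}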
public void Write_read_nullable_column(Array input)
{
    var id = new DataField<int?>("id");

    var ms = new MemoryStream();
    using (var writer = new ParquetWriter(new Schema(id), ms))
    {
        using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
        {
            rg.WriteColumn(new DataColumn(id, input));
        }
    }

    ms.Position = 0;
    using (var reader = new ParquetReader(ms))
    {
        Assert.Equal(1, reader.RowGroupCount);

        using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(0))
        {
            Assert.Equal(input.Length, rg.RowCount);
            Assert.Equal(input, rg.ReadColumn(id).Data);
        }
    }
}
public void I_can_write_snappy_and_read_back()
{
    var ms = new MemoryStream();
    var ds1 = new DataSet(
        new DataField<int>("id"),
        new DataField<int>("no"));
    ds1.Add(1, 3);
    ds1.Add(2, 4);

    DataSet ds2;

    //write
    using (var writer = new ParquetWriter(ms))
    {
        writer.Write(ds1, CompressionMethod.Snappy);
    }

    //read back
    using (var reader = new ParquetReader(ms))
    {
        ms.Position = 0;
        ds2 = reader.Read();
    }

    Assert.Equal(1, ds2[0].GetInt(0));
    Assert.Equal(2, ds2[1].GetInt(0));
    Assert.Equal(3, ds2[0].GetInt(1));
    Assert.Equal(4, ds2[1].GetInt(1));
}
public void CustomMetadata_can_write_and_read()
{
    var ms = new MemoryStream();
    var id = new DataField<int>("id");

    //write
    using (var writer = new ParquetWriter(new Schema(id), ms))
    {
        writer.CustomMetadata = new Dictionary<string, string>
        {
            ["key1"] = "value1",
            ["key2"] = "value2"
        };

        using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
        {
            rg.WriteColumn(new DataColumn(id, new[] { 1, 2, 3, 4 }));
        }
    }

    //read back
    using (var reader = new ParquetReader(ms))
    {
        Assert.Equal("value1", reader.CustomMetadata["key1"]);
        Assert.Equal("value2", reader.CustomMetadata["key2"]);
    }
}
protected Thrift.FileMetaData ReadInternalMetadata()
{
    using (var reader = ParquetReader.OpenFromFile(_path))
    {
        return reader.ThriftMetadata;
    }
}