public void Read_multiple_data_pages() { using (var reader = new ParquetReader(OpenTestFile("/special/multi_data_page.parquet"), leaveStreamOpen: false)) { DataColumn[] columns = reader.ReadEntireRowGroup(); string[] s = (string[])columns[0].Data; double?[] d = (double?[])columns[1].Data; // check for nulls (issue #370) for (int i = 0; i < s.Length; i++) { Assert.True(s[i] != null, "found null in s at " + i); Assert.True(d[i] != null, "found null in d at " + i); } // run aggregations checking row alignment (issue #371) var seq = s.Zip(d.Cast <double>(), (w, v) => new { w, v }) .Where(p => p.w == "general") .ToList(); // double matching is fuzzy, but matching strings is enough for this test Assert.Equal("0.754359925788497", seq.Min(p => p.v).ToString(CultureInfo.InvariantCulture)); Assert.Equal("0.85776", seq.Max(p => p.v).ToString(CultureInfo.InvariantCulture)); } }
public void Read_multiple_data_pages() { using (var reader = new ParquetReader(OpenTestFile("/special/multi_data_page.parquet"), leaveStreamOpen: false)) { DataColumn[] columns = reader.ReadEntireRowGroup(); string[] s = (string[])columns[0].Data; double?[] d = (double?[])columns[1].Data; // check for nulls (issue #370) for (int i = 0; i < s.Length; i++) { Assert.True(s[i] != null, "found null in s at " + i); Assert.True(d[i] != null, "found null in d at " + i); } // run aggregations checking row alignment (issue #371) var seq = s.Zip(d.Cast <double>(), (w, v) => new { w, v }) .Where(p => p.w == "favorable") .ToList(); // double matching is fuzzy, but matching strings is enough for this test // ground truth was computed using Spark Assert.Equal(26706.6185312147, seq.Sum(p => p.v), 5); Assert.Equal(0.808287234987281, seq.Average(p => p.v), 5); Assert.Equal(0.71523915461624, seq.Min(p => p.v), 5); Assert.Equal(0.867111980015206, seq.Max(p => p.v), 5); } }
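// Both Read_multiple_data_pages variants above follow the same pattern: open a
// stream, construct a ParquetReader, decode a whole row group, and cast each
// column's Data array. A minimal, self-contained sketch of that pattern
// (assuming Parquet.NET 3.x, where ParquetReader is constructed directly from
// a stream; the file name here is a hypothetical stand-in):
using System;
using System.IO;
using Parquet;
using Parquet.Data;

static class ReadSketch {
    static void Main() {
        // hypothetical input: any Parquet file
        using Stream stream = File.OpenRead("data.parquet");
        using var reader = new ParquetReader(stream);

        // ReadEntireRowGroup() decodes every column of the first row group
        DataColumn[] columns = reader.ReadEntireRowGroup();

        // Data is a plain CLR array typed after the column's storage type
        Console.WriteLine($"{columns.Length} columns, {columns[0].Data.Length} rows");
    }
}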
public List <DataEntity> ParseFileSchema(DataContainer container, DataCollection collection, Stream fileStream, out long rowCount) { var entities = new List <DataEntity>(); var options = new ParquetOptions { TreatByteArrayAsString = true }; var reader = new ParquetReader(fileStream, options); var schema = reader.Schema; var fields = schema.GetDataFields(); foreach (var field in fields) { entities.Add(new DataEntity(field.Name, ConvertDataType(field.DataType), Enum.GetName(typeof(Parquet.Data.DataType), field.DataType), container, collection)); } rowCount = 0; for (int i = 0; i < reader.RowGroupCount; i++) { var columns = reader.ReadEntireRowGroup(i); rowCount += columns[0].Data.Length; } return(entities); }
public void Read_hardcoded_decimal() {
    using (var reader = new ParquetReader(OpenTestFile("complex-primitives.parquet"))) {
        decimal value = (decimal)reader.ReadEntireRowGroup()[1].Data.GetValue(0);
        Assert.Equal(1.2m, value);
    }
}
private static decimal[] ReadFile(string filename) {
    using var stream = File.OpenRead(filename);
    using var parquetReader = new ParquetReader(stream);
    var results = parquetReader.ReadEntireRowGroup();
    return (decimal[])results[0].Data;
}
public void FixedLenByteArray_dictionary() {
    using (Stream s = OpenTestFile("fixedlenbytearray.parquet")) {
        using (var r = new ParquetReader(s)) {
            // smoke test: decoding a dictionary-encoded FIXED_LEN_BYTE_ARRAY column must not throw
            DataColumn[] columns = r.ReadEntireRowGroup();
        }
    }
}
public void Read_simple_map() {
    using (var reader = new ParquetReader(OpenTestFile("map_simple.parquet"), leaveStreamOpen: false)) {
        DataColumn[] data = reader.ReadEntireRowGroup();

        Assert.Equal(new int?[] { 1 }, data[0].Data);
        Assert.Equal(new int[] { 1, 2, 3 }, data[1].Data);
        Assert.Equal(new string[] { "one", "two", "three" }, data[2].Data);
    }
}
public DataColumn[] ParquetDotNet() {
    using var stream = File.OpenRead(Filename);
    using var parquetReader = new ParquetReader(stream);
    var results = parquetReader.ReadEntireRowGroup();
    if (Check.Enabled) {
        Check.ArraysAreEqual(_values, (decimal[])results[0].Data);
    }
    return results;
}
public void ParquetReader_EmptyColumn() {
    using (var reader = new ParquetReader(OpenTestFile("emptycolumn.parquet"), leaveStreamOpen: false)) {
        DataColumn[] columns = reader.ReadEntireRowGroup();

        int?[] col0 = (int?[])columns[0].Data;
        Assert.Equal(10, col0.Length);

        foreach (int? value in col0) {
            Assert.Null(value);
        }
    }
}
/// <summary> /// Lee un registro /// </summary> public bool Read() { bool readed = false; // Recorre los grupos de filas del archivo if (_groupRowColumns == null || _actualRow >= _groupRowColumns[0].Data.Length) { // Obtiene el lector con el grupo de filas if (_rowGroup < _parquetReader.RowGroupCount) { _groupRowColumns = _parquetReader.ReadEntireRowGroup(_rowGroup).ToArray(); } else { _groupRowColumns = null; } // Incrementa el número de grupo y cambia la fila actual _rowGroup++; _actualRow = 0; } // Obtiene los datos (si queda algo por leer) if (_groupRowColumns != null) { // Transforma las columnas _rowValues = new List <object>(); foreach (DataColumn column in _groupRowColumns) { object value = column.Data.GetValue(_actualRow); // Parquet almacena las fechas como DateTimeOffset y se debe convertir a un dateTime if (value is DateTimeOffset date) { value = ConvertFromDateTimeOffset(date); } // Añade el valor _rowValues.Add(value); } // Indica que se ha leido el registro e incrementa la fila actual readed = true; _actualRow++; // Incrementa la fila total y lanza el evento _row++; if (_row % NotifyAfter == 0) { RaiseEventReadBlock(_row); } } // Devuelve el valor que indica si se ha leido un registro return(readed); }
public void Read_bit_packed_at_page_boundary() {
    using (var reader = new ParquetReader(OpenTestFile("/special/multi_page_bit_packed_near_page_border.parquet"))) {
        DataColumn[] columns = reader.ReadEntireRowGroup();
        string[] data = (string[])columns[0].Data;

        // ground truth from Spark
        Assert.Equal(30855, data.Count(string.IsNullOrEmpty));

        // check page boundary
        Assert.Equal("collateral_natixis_fr_vol5010", data[60355]);
        Assert.Equal("BRAZ82595832_vol16239", data[60356]);
    }
}
public void Reads_byte_arrays() {
    byte[] nameValue;
    byte[] expectedValue = Encoding.UTF8.GetBytes("ALGERIA");

    using (var reader = new ParquetReader(OpenTestFile(@"real/nation.plain.parquet"), leaveStreamOpen: false)) {
        DataColumn[] data = reader.ReadEntireRowGroup();

        byte[][] nameColumn = (byte[][])data[1].Data;
        nameValue = nameColumn[0];

        Assert.Equal<IEnumerable<byte>>(expectedValue, nameValue);
    }

    Assert.Equal<IEnumerable<byte>>(expectedValue, nameValue);
}
public DataColumn[] ParquetDotNet() {
    using var stream = File.OpenRead(Filename);
    using var parquetReader = new ParquetReader(stream);
    var results = parquetReader.ReadEntireRowGroup();
    if (Check.Enabled) {
        Check.ArraysAreEqual(_allDatesAsDateTimeOffsets, (DateTimeOffset[])results[0].Data);
        Check.ArraysAreEqual(_allObjectIds, (int[])results[1].Data);
        Check.ArraysAreEqual(_allValues, (float[])results[2].Data);
    }
    return results;
}
public void Datetypes_all() {
    DateTimeOffset offset, offset2;

    using (Stream s = OpenTestFile("dates.parquet")) {
        using (var r = new ParquetReader(s)) {
            DataColumn[] columns = r.ReadEntireRowGroup();

            offset = (DateTimeOffset)columns[1].Data.GetValue(0);
            offset2 = (DateTimeOffset)columns[1].Data.GetValue(1);
        }
    }

    Assert.Equal(new DateTime(2017, 1, 1), offset.Date);
    Assert.Equal(new DateTime(2017, 2, 1), offset2.Date);
}
public void ReadLargeTimestampData() {
    using (var reader = new ParquetReader(OpenTestFile("/mixed-dictionary-plain.parquet"), leaveStreamOpen: false)) {
        DataColumn[] columns = reader.ReadEntireRowGroup();

        DateTimeOffset?[] col0 = (DateTimeOffset?[])columns[0].Data;
        Assert.Equal(440773, col0.Length);

        long ticks = col0[0].Value.Ticks;
        for (int i = 1; i < 132000; i++) {
            long now = col0[i].Value.Ticks;
            Assert.NotEqual(ticks, now);
        }
    }
}
public void DateTime_FromOtherSystem() {
    DateTimeOffset offset;

    using (Stream s = OpenTestFile("datetime_other_system.parquet")) {
        using (var r = new ParquetReader(s)) {
            DataColumn[] columns = r.ReadEntireRowGroup();

            DataColumn as_at_date_col = columns.FirstOrDefault(x => x.Field.Name == "as_at_date_");
            Assert.NotNull(as_at_date_col);

            offset = (DateTimeOffset)as_at_date_col.Data.GetValue(0);
            Assert.Equal(new DateTime(2018, 12, 14, 0, 0, 0), offset.Date);
        }
    }
}
public void Reads_multi_page_file() {
    using (var reader = new ParquetReader(OpenTestFile("multi.page.parquet"), leaveStreamOpen: false)) {
        DataColumn[] data = reader.ReadEntireRowGroup();
        Assert.Equal(927861, data[0].Data.Length);

        int[] firstColumn = (int[])data[0].Data;
        Assert.Equal(30763, firstColumn[524286]);
        Assert.Equal(30766, firstColumn[524287]);

        // At row 524288 the data is split into another page.
        // The column makes use of a dictionary to reduce the number of values,
        // and the default dictionary index value is zero (i.e. the first record value).
        Assert.NotEqual(firstColumn[0], firstColumn[524288]);

        // The value should be 30768
        Assert.Equal(30768, firstColumn[524288]);
    }
}
public void Read_multi_page_dictionary_with_nulls() {
    using (var reader = new ParquetReader(OpenTestFile("/special/multi_page_dictionary_with_nulls.parquet"))) {
        DataColumn[] columns = reader.ReadEntireRowGroup();
        var rg = reader.OpenRowGroupReader(0);

        // reading columns
        var data = (string[])columns[0].Data;

        // ground truth from Spark

        // check page boundary (first page contains 107432 rows)
        Assert.Equal("xc3w4eudww", data[107432]);
        Assert.Equal("bpywp4wtwk", data[107433]);
        Assert.Equal("z6x8652rle", data[107434]);

        // check near the end of the file
        Assert.Null(data[310028]);
        Assert.Equal("wok86kie6c", data[310029]);
        Assert.Equal("le9i7kbbib", data[310030]);
    }
}
public List <string> CollectSamples(DataContainer container, DataCollection collection, DataEntity entity, int entityIndex, Stream fileStream, int maxSamples, double probability) { var result = new List <string>(); var rand = new Random(); var options = new ParquetOptions { TreatByteArrayAsString = true }; var reader = new ParquetReader(fileStream, options); for (int i = 0; i < reader.RowGroupCount; i++) { var columns = reader.ReadEntireRowGroup(i); var column = columns.FirstOrDefault(x => x.Field.Name.Equals(entity.Name)); if (column != null) { for (int j = 0; j < column.Data.Length; j++) { if (rand.NextDouble() < probability) { result.Add(column.Data.GetValue(j)?.ToString()); if (result.Count >= maxSamples) { break; } } } } if (result.Count >= maxSamples) { break; } } return(result); }
public static void TestReadFloatTimeSeries([Values(0, 1, 2, 3, 5)] int warmup) {
    var timer = Stopwatch.StartNew();

    Console.WriteLine("Generating data...");
    var (dates, objectIds, values, numRows) = CreateFloatDataFrame(3600);
    Console.WriteLine("Generated {0:N0} rows in {1:N2} sec", numRows, timer.Elapsed.TotalSeconds);
    Console.WriteLine();

    Console.WriteLine("Saving to Parquet");
    timer.Restart();
    const string filename = "float_timeseries.parquet";
    using (var fileWriter = new ParquetFileWriter(filename, CreateFloatColumns(), Compression.Snappy)) {
        using var rowGroupWriter = fileWriter.AppendRowGroup();

        using (var dateTimeWriter = rowGroupWriter.NextColumn().LogicalWriter<DateTime>()) {
            for (int i = 0; i != dates.Length; ++i) {
                dateTimeWriter.WriteBatch(Enumerable.Repeat(dates[i], objectIds.Length).ToArray());
            }
        }
        using (var objectIdWriter = rowGroupWriter.NextColumn().LogicalWriter<int>()) {
            for (int i = 0; i != dates.Length; ++i) {
                objectIdWriter.WriteBatch(objectIds);
            }
        }
        using (var valueWriter = rowGroupWriter.NextColumn().LogicalWriter<float>()) {
            for (int i = 0; i != dates.Length; ++i) {
                valueWriter.WriteBatch(values[i]);
            }
        }

        fileWriter.Close();
    }
    var fileLength = new FileInfo(filename).Length;
    Console.WriteLine("Saved to Parquet ({0:N0} bytes) in {1:N2} sec", fileLength, timer.Elapsed.TotalSeconds);
    Console.WriteLine();

    Console.WriteLine("Reading from Parquet");
    timer.Restart();
    using (var fileReader = new ParquetFileReader(filename)) {
        using var groupReader = fileReader.RowGroup(0);

        using (var dateTimeReader = groupReader.Column(0).LogicalReader<DateTime>()) {
            dateTimeReader.ReadAll(numRows);
        }
        using (var objectIdReader = groupReader.Column(1).LogicalReader<int>()) {
            objectIdReader.ReadAll(numRows);
        }
        using (var valueReader = groupReader.Column(2).LogicalReader<float>()) {
            valueReader.ReadAll(numRows);
        }
    }
    Console.WriteLine("Read Parquet ({0:N0} bytes) in {1:N3} sec", fileLength, timer.Elapsed.TotalSeconds);
    Console.WriteLine();

    Console.WriteLine("Reading from Parquet (Parquet.NET)");
    timer.Restart();
    using (var stream = File.OpenRead(filename)) {
        using var parquetReader = new ParquetReader(stream);
        parquetReader.ReadEntireRowGroup();
    }
    Console.WriteLine("Read Parquet (Parquet.NET {0:N0} bytes) in {1:N3} sec", fileLength, timer.Elapsed.TotalSeconds);
    Console.WriteLine();
}
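// The benchmark above references a CreateFloatColumns() helper that is not
// shown. A plausible sketch, assuming ParquetSharp's Column descriptors; the
// column names are assumptions, not taken from the original benchmark:
using ParquetSharp;

static Column[] CreateFloatColumns() {
    return new Column[] {
        new Column<DateTime>("DateTime"),   // assumed name
        new Column<int>("ObjectId"),        // assumed name
        new Column<float>("Value"),         // assumed name
    };
}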