public void Read_multiple_data_pages()
        {
            // Regression test: a row group spanning several data pages must come
            // back with columns fully aligned and no spurious nulls.
            using (var reader =
                       new ParquetReader(OpenTestFile("/special/multi_data_page.parquet"), leaveStreamOpen: false))
            {
                DataColumn[] columns = reader.ReadEntireRowGroup();

                var words  = (string[])columns[0].Data;
                var values = (double?[])columns[1].Data;

                // check for nulls (issue #370)
                for (int row = 0; row < words.Length; row++)
                {
                    Assert.True(words[row] != null, "found null in s at " + row);
                    Assert.True(values[row] != null, "found null in d at " + row);
                }

                // run aggregations checking row alignment (issue #371)
                var matching = words
                               .Zip(values.Cast <double>(), (word, value) => (word, value))
                               .Where(pair => pair.word == "general")
                               .Select(pair => pair.value)
                               .ToList();

                // double matching is fuzzy, but matching strings is enough for this test
                Assert.Equal("0.754359925788497", matching.Min().ToString(CultureInfo.InvariantCulture));
                Assert.Equal("0.85776", matching.Max().ToString(CultureInfo.InvariantCulture));
            }
        }
Ejemplo n.º 2
0
        public void Read_multiple_data_pages()
        {
            // Regression test: multi-page row group must decode with columns aligned.
            using (var reader =
                       new ParquetReader(OpenTestFile("/special/multi_data_page.parquet"), leaveStreamOpen: false))
            {
                DataColumn[] columns = reader.ReadEntireRowGroup();

                var words  = (string[])columns[0].Data;
                var values = (double?[])columns[1].Data;

                // check for nulls (issue #370)
                for (int row = 0; row < words.Length; row++)
                {
                    Assert.True(words[row] != null, "found null in s at " + row);
                    Assert.True(values[row] != null, "found null in d at " + row);
                }

                // run aggregations checking row alignment (issue #371)
                var matching = words
                               .Zip(values.Cast <double>(), (word, value) => (word, value))
                               .Where(pair => pair.word == "favorable")
                               .Select(pair => pair.value)
                               .ToList();

                // double matching is fuzzy, but matching strings is enough for this test
                // ground truth was computed using Spark
                Assert.Equal(26706.6185312147, matching.Sum(), 5);
                Assert.Equal(0.808287234987281, matching.Average(), 5);
                Assert.Equal(0.71523915461624, matching.Min(), 5);
                Assert.Equal(0.867111980015206, matching.Max(), 5);
            }
        }
        /// <summary>
        /// Builds one <c>DataEntity</c> per data field declared in the Parquet file's schema
        /// and counts the total rows across all row groups.
        /// </summary>
        /// <param name="container">Container the resulting entities belong to.</param>
        /// <param name="collection">Collection the resulting entities belong to.</param>
        /// <param name="fileStream">Readable stream positioned at a Parquet file.</param>
        /// <param name="rowCount">Receives the total number of rows summed over every row group.</param>
        /// <returns>The list of entities, one per schema data field.</returns>
        public List <DataEntity> ParseFileSchema(DataContainer container, DataCollection collection, Stream fileStream, out long rowCount)
        {
            var entities = new List <DataEntity>();

            var options = new ParquetOptions {
                TreatByteArrayAsString = true
            };

            // FIX: the reader was never disposed; dispose it deterministically.
            // Parquet.NET's reader leaves the caller's stream open by default.
            using (var reader = new ParquetReader(fileStream, options))
            {
                var schema = reader.Schema;
                var fields = schema.GetDataFields();

                foreach (var field in fields)
                {
                    entities.Add(new DataEntity(field.Name, ConvertDataType(field.DataType),
                                                Enum.GetName(typeof(Parquet.Data.DataType), field.DataType), container, collection));
                }

                // Row count = sum of the first column's length in every row group.
                rowCount = 0;
                for (int i = 0; i < reader.RowGroupCount; i++)
                {
                    var columns = reader.ReadEntireRowGroup(i);
                    rowCount += columns[0].Data.Length;
                }
            }

            return(entities);
        }
Ejemplo n.º 4
0
 public void Read_hardcoded_decimal()
 {
     using (var reader = new ParquetReader(OpenTestFile("complex-primitives.parquet")))
     {
         // Column 1 holds the decimal; take the first row and compare.
         object raw = reader.ReadEntireRowGroup()[1].Data.GetValue(0);
         Assert.Equal((decimal)1.2, (decimal)raw);
     }
 }
Ejemplo n.º 5
0
        /// <summary>
        /// Reads the first column of the first row group of a Parquet file as decimals.
        /// </summary>
        private static decimal[] ReadFile(string filename)
        {
            using (var stream = File.OpenRead(filename))
            using (var parquetReader = new ParquetReader(stream))
            {
                DataColumn[] columns = parquetReader.ReadEntireRowGroup();
                return (decimal[])columns[0].Data;
            }
        }
 public void FixedLenByteArray_dictionary()
 {
     // Decoding the row group without throwing is the whole assertion here.
     using Stream stream = OpenTestFile("fixedlenbytearray.parquet");
     using var reader = new ParquetReader(stream);
     DataColumn[] columns = reader.ReadEntireRowGroup();
 }
Ejemplo n.º 7
0
        public void Read_simple_map()
        {
            using (var reader = new ParquetReader(OpenTestFile("map_simple.parquet"), leaveStreamOpen: false))
            {
                DataColumn[] columns = reader.ReadEntireRowGroup();

                // Map comes back as three flattened columns: row id, keys, values.
                Assert.Equal(new int?[] { 1 }, columns[0].Data);
                Assert.Equal(new int[] { 1, 2, 3 }, columns[1].Data);
                Assert.Equal(new string[] { "one", "two", "three" }, columns[2].Data);
            }
        }
Ejemplo n.º 8
0
        public DataColumn[] ParquetDotNet()
        {
            // Decode the whole file with Parquet.NET.
            using (var stream = File.OpenRead(Filename))
            using (var parquetReader = new ParquetReader(stream))
            {
                DataColumn[] columns = parquetReader.ReadEntireRowGroup();

                // Optional validation against the values the benchmark wrote.
                if (Check.Enabled)
                {
                    Check.ArraysAreEqual(_values, (decimal[])columns[0].Data);
                }

                return columns;
            }
        }
Ejemplo n.º 9
0
 public void ParquetReader_EmptyColumn()
 {
     using (var reader = new ParquetReader(OpenTestFile("emptycolumn.parquet"), leaveStreamOpen: false))
     {
         DataColumn[] columns = reader.ReadEntireRowGroup();

         // The column has 10 rows and every single one must be null.
         var col0 = (int?[])columns[0].Data;
         Assert.Equal(10, col0.Length);
         for (int i = 0; i < col0.Length; i++)
         {
             Assert.Null(col0[i]);
         }
     }
 }
Ejemplo n.º 10
0
        /// <summary>
        ///		Reads one record, loading the next row group when the current one is exhausted.
        ///		Returns true while records remain; false once the file is fully consumed.
        /// </summary>
        public bool Read()
        {
            bool readed = false;

            // No group loaded yet, or the current group's rows are used up
            if (_groupRowColumns == null || _actualRow >= _groupRowColumns[0].Data.Length)
            {
                // Load the columns of the next row group, if any remain
                if (_rowGroup < _parquetReader.RowGroupCount)
                {
                    _groupRowColumns = _parquetReader.ReadEntireRowGroup(_rowGroup).ToArray();
                }
                else
                {
                    _groupRowColumns = null;
                }
                // Advance the group number and reset the current row
                _rowGroup++;
                _actualRow = 0;
            }
            // Extract the row's values (if anything is left to read)
            if (_groupRowColumns != null)
            {
                // Transform the columns into the current row's values
                _rowValues = new List <object>();
                foreach (DataColumn column in _groupRowColumns)
                {
                    object value = column.Data.GetValue(_actualRow);

                    // Parquet surfaces dates as DateTimeOffset; convert to a DateTime
                    if (value is DateTimeOffset date)
                    {
                        value = ConvertFromDateTimeOffset(date);
                    }
                    // Add the value to the row
                    _rowValues.Add(value);
                }
                // Mark the record as read and advance the current row
                readed = true;
                _actualRow++;
                // Bump the total row counter and raise the progress event every NotifyAfter rows
                _row++;
                if (_row % NotifyAfter == 0)
                {
                    RaiseEventReadBlock(_row);
                }
            }
            // Return whether a record was read
            return(readed);
        }
Ejemplo n.º 11
0
        public void Read_bit_packed_at_page_boundary()
        {
            // Bit-packed RLE values crossing a page border must decode intact.
            using (var reader = new ParquetReader(OpenTestFile("/special/multi_page_bit_packed_near_page_border.parquet")))
            {
                var data = (string[])reader.ReadEntireRowGroup()[0].Data;

                // ground truth from spark
                Assert.Equal(30855, data.Count(string.IsNullOrEmpty));

                // check page boundary
                Assert.Equal("collateral_natixis_fr_vol5010", data[60355]);
                Assert.Equal("BRAZ82595832_vol16239", data[60356]);
            }
        }
Ejemplo n.º 12
0
        public void Reads_byte_arrays()
        {
            byte[] nameValue;
            byte[] expectedValue = Encoding.UTF8.GetBytes("ALGERIA");

            using (var reader = new ParquetReader(OpenTestFile(@"real/nation.plain.parquet"), leaveStreamOpen: false))
            {
                DataColumn[] columns = reader.ReadEntireRowGroup();

                // Column 1 is the nation name, surfaced as raw UTF-8 byte arrays.
                var nameColumn = (byte[][])columns[1].Data;
                nameValue = nameColumn[0];
                Assert.Equal <IEnumerable <byte> >(expectedValue, nameValue);
            }

            // The captured bytes must remain valid after the reader is disposed.
            Assert.Equal <IEnumerable <byte> >(expectedValue, nameValue);
        }
Ejemplo n.º 13
0
        public DataColumn[] ParquetDotNet()
        {
            // Decode the time-series file with Parquet.NET.
            using (var stream = File.OpenRead(Filename))
            using (var parquetReader = new ParquetReader(stream))
            {
                DataColumn[] columns = parquetReader.ReadEntireRowGroup();

                // Optional validation of all three columns against the generated data.
                if (Check.Enabled)
                {
                    Check.ArraysAreEqual(_allDatesAsDateTimeOffsets, (DateTimeOffset[])columns[0].Data);
                    Check.ArraysAreEqual(_allObjectIds, (int[])columns[1].Data);
                    Check.ArraysAreEqual(_allValues, (float[])columns[2].Data);
                }

                return columns;
            }
        }
        public void Datetypes_all()
        {
            DateTimeOffset offset, offset2;

            using (Stream s = OpenTestFile("dates.parquet"))
            using (var r = new ParquetReader(s))
            {
                DataColumn[] columns = r.ReadEntireRowGroup();

                // Column 1 carries the date values; grab the first two rows.
                Array dates = columns[1].Data;
                offset  = (DateTimeOffset)dates.GetValue(0);
                offset2 = (DateTimeOffset)dates.GetValue(1);
            }

            Assert.Equal(new DateTime(2017, 1, 1), offset.Date);
            Assert.Equal(new DateTime(2017, 2, 1), offset2.Date);
        }
Ejemplo n.º 15
0
        public void ReadLargeTimestampData()
        {
            using (var reader = new ParquetReader(OpenTestFile("/mixed-dictionary-plain.parquet"), leaveStreamOpen: false))
            {
                DataColumn[] columns = reader.ReadEntireRowGroup();

                var timestamps = (DateTimeOffset?[])columns[0].Data;
                Assert.Equal(440773, timestamps.Length);

                // Every timestamp in the first 132k rows must differ from row 0,
                // proving dictionary- and plain-encoded pages both decoded correctly.
                long firstTicks = timestamps[0].Value.Ticks;
                for (int i = 1; i < 132000; i++)
                {
                    Assert.NotEqual(firstTicks, timestamps[i].Value.Ticks);
                }
            }
        }
        public void DateTime_FromOtherSystem()
        {
            DateTimeOffset offset;

            using (Stream s = OpenTestFile("datetime_other_system.parquet"))
            using (var r = new ParquetReader(s))
            {
                DataColumn[] columns = r.ReadEntireRowGroup();

                // Locate the date column by field name rather than position.
                DataColumn as_at_date_col = columns.FirstOrDefault(x => x.Field.Name == "as_at_date_");
                Assert.NotNull(as_at_date_col);

                offset = (DateTimeOffset)as_at_date_col.Data.GetValue(0);
                Assert.Equal(new DateTime(2018, 12, 14, 0, 0, 0), offset.Date);
            }
        }
Ejemplo n.º 17
0
        public void Reads_multi_page_file()
        {
            using (var reader = new ParquetReader(OpenTestFile("multi.page.parquet"), leaveStreamOpen: false))
            {
                DataColumn[] columns = reader.ReadEntireRowGroup();
                Assert.Equal(927861, columns[0].Data.Length);

                var firstColumn = (int[])columns[0].Data;

                // Last rows of the first page.
                Assert.Equal(30763, firstColumn[524286]);
                Assert.Equal(30766, firstColumn[524287]);

                //At row 524288 the data is split into another page
                //The column makes use of a dictionary to reduce the number of values and the default dictionary index value is zero (i.e. the first record value)
                Assert.NotEqual(firstColumn[0], firstColumn[524288]);

                //The value should be 30768
                Assert.Equal(30768, firstColumn[524288]);
            }
        }
Ejemplo n.º 18
0
        /// <summary>
        /// Verifies a multi-page dictionary-encoded column containing nulls decodes
        /// correctly across page boundaries and near the end of the file.
        /// </summary>
        public void Read_multi_page_dictionary_with_nulls()
        {
            using (var reader = new ParquetReader(OpenTestFile("/special/multi_page_dictionary_with_nulls.parquet")))
            {
                // FIX: removed the unused (and undisposed) OpenRowGroupReader(0) call.
                DataColumn[] columns = reader.ReadEntireRowGroup();

                // reading columns
                var data = (string[])columns[0].Data;

                // ground truth from spark
                // check page boundary (first page contains 107432 rows)
                Assert.Equal("xc3w4eudww", data[107432]);
                Assert.Equal("bpywp4wtwk", data[107433]);
                Assert.Equal("z6x8652rle", data[107434]);

                // check near the end of the file
                Assert.Null(data[310028]);
                Assert.Equal("wok86kie6c", data[310029]);
                Assert.Equal("le9i7kbbib", data[310030]);
            }
        }
        /// <summary>
        /// Randomly samples string representations of one entity's column values
        /// across all row groups of a Parquet file.
        /// </summary>
        /// <param name="container">Owning container (unused here, kept for interface parity).</param>
        /// <param name="collection">Owning collection (unused here, kept for interface parity).</param>
        /// <param name="entity">Entity whose field name selects the column to sample.</param>
        /// <param name="entityIndex">Index of the entity (unused by this implementation).</param>
        /// <param name="fileStream">Readable stream positioned at a Parquet file.</param>
        /// <param name="maxSamples">Upper bound on the number of samples returned.</param>
        /// <param name="probability">Per-value inclusion probability in [0, 1].</param>
        /// <returns>Up to <paramref name="maxSamples"/> sampled values; nulls become null strings.</returns>
        public List <string> CollectSamples(DataContainer container, DataCollection collection, DataEntity entity, int entityIndex, Stream fileStream, int maxSamples, double probability)
        {
            var result = new List <string>();
            var rand   = new Random();

            var options = new ParquetOptions {
                TreatByteArrayAsString = true
            };

            // FIX: the reader was never disposed; dispose it deterministically.
            // Parquet.NET's reader leaves the caller's stream open by default.
            using (var reader = new ParquetReader(fileStream, options))
            {
                for (int i = 0; i < reader.RowGroupCount; i++)
                {
                    var columns = reader.ReadEntireRowGroup(i);

                    var column = columns.FirstOrDefault(x => x.Field.Name.Equals(entity.Name));
                    if (column != null)
                    {
                        for (int j = 0; j < column.Data.Length; j++)
                        {
                            // Bernoulli sampling: include each value with the given probability.
                            if (rand.NextDouble() < probability)
                            {
                                result.Add(column.Data.GetValue(j)?.ToString());
                                if (result.Count >= maxSamples)
                                {
                                    break;
                                }
                            }
                        }
                    }

                    // Stop reading further row groups once the quota is met.
                    if (result.Count >= maxSamples)
                    {
                        break;
                    }
                }
            }

            return(result);
        }
        /// <summary>
        /// End-to-end benchmark: generates a float time-series, writes it with ParquetSharp,
        /// then reads it back twice — once with ParquetSharp and once with Parquet.NET —
        /// printing file size and elapsed time for each phase. The warmup parameter only
        /// varies the test-case invocations; it is not read inside the method.
        /// </summary>
        public static void TestReadFloatTimeSeries([Values(0, 1, 2, 3, 5)] int warmup)
        {
            var timer = Stopwatch.StartNew();

            Console.WriteLine("Generating data...");

            // dates: one entry per time step; values: one float array per date,
            // each objectIds.Length long; numRows = dates.Length * objectIds.Length.
            var(dates, objectIds, values, numRows) = CreateFloatDataFrame(3600);

            Console.WriteLine("Generated {0:N0} rows in {1:N2} sec", numRows, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
            Console.WriteLine("Saving to Parquet");

            timer.Restart();

            const string filename = "float_timeseries.parquet";

            // Write phase (ParquetSharp): one row group, three columns written in
            // schema order — timestamps, object ids, float values.
            using (var fileWriter = new ParquetFileWriter(filename, CreateFloatColumns(), Compression.Snappy))
            {
                using var rowGroupWriter = fileWriter.AppendRowGroup();

                using (var dateTimeWriter = rowGroupWriter.NextColumn().LogicalWriter <DateTime>())
                {
                    // Each date is repeated once per object id so rows stay aligned.
                    for (int i = 0; i != dates.Length; ++i)
                    {
                        dateTimeWriter.WriteBatch(Enumerable.Repeat(dates[i], objectIds.Length).ToArray());
                    }
                }

                using (var objectIdWriter = rowGroupWriter.NextColumn().LogicalWriter <int>())
                {
                    // The full objectIds block repeats for every date.
                    for (int i = 0; i != dates.Length; ++i)
                    {
                        objectIdWriter.WriteBatch(objectIds);
                    }
                }

                using (var valueWriter = rowGroupWriter.NextColumn().LogicalWriter <float>())
                {
                    for (int i = 0; i != dates.Length; ++i)
                    {
                        valueWriter.WriteBatch(values[i]);
                    }
                }

                // Explicit Close flushes the footer before the timing below is taken.
                fileWriter.Close();
            }


            var fileLength = new FileInfo(filename).Length;

            Console.WriteLine("Saved to Parquet ({0:N0} bytes) in {1:N2} sec", fileLength, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
            Console.WriteLine("Reading from Parquet");

            timer.Restart();

            // Read phase 1 (ParquetSharp): read all three columns back fully.
            using (var fileReader = new ParquetFileReader(filename))
            {
                using var groupReader = fileReader.RowGroup(0);

                using (var dateTimeReader = groupReader.Column(0).LogicalReader <DateTime>())
                {
                    dateTimeReader.ReadAll(numRows);
                }

                using (var objectIdReader = groupReader.Column(1).LogicalReader <int>())
                {
                    objectIdReader.ReadAll(numRows);
                }

                using (var valueReader = groupReader.Column(2).LogicalReader <float>())
                {
                    valueReader.ReadAll(numRows);
                }
            }

            Console.WriteLine("Read Parquet ({0:N0} bytes) in {1:N3} sec", fileLength, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
            Console.WriteLine("Reading from Parquet (Parquet.NET)");

            timer.Restart();

            // Read phase 2 (Parquet.NET): decode the whole row group for comparison.
            using (var stream = File.OpenRead(filename))
            {
                using var parquetReader = new ParquetReader(stream);
                parquetReader.ReadEntireRowGroup();
            }

            Console.WriteLine("Read Parquet (Parquet.NET {0:N0} bytes) in {1:N3} sec", fileLength, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
        }