Example #1
        private void ReadDataBackgroundWorker_DoWork(object sender, DoWorkEventArgs e)
        {
            //Parquet.NET doesn't have any async methods or readers that allow sequential record reading, so we need to use the ThreadPool to support cancellation.
            var task = Task.Run(() =>
            {
                //Unfortunately there's no way to quickly get the metadata from a parquet file without reading an actual data row
                //BUG: Parquet.NET doesn't always respect the Count parameter, sometimes returning more than the passed value...
                using (var parquetReader = ParquetReader.OpenFromFile(this.OpenFilePath, new ParquetOptions()
                {
                    TreatByteArrayAsString = true
                }))
                {
                    int totalRowCount = 0;
                    DataTable result  = UtilityMethods.ParquetReaderToDataTable(parquetReader, out totalRowCount, this.SelectedFields, this.CurrentOffset, this.CurrentMaxRowCount);
                    return(new ParquetReadResult(result, totalRowCount));
                }
            });

            while (!task.IsCompleted && !((BackgroundWorker)sender).CancellationPending)
            {
                task.Wait(1000);
            }

            e.Cancel = ((BackgroundWorker)sender).CancellationPending;

            if (task.IsCompleted)
            {
                e.Result = task.Result;
            }
        }
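
The polling loop above is the standard workaround when a blocking API exposes no cancellation hook: run the read on the ThreadPool and poll. A minimal standalone sketch of the same pattern, assuming only the BCL (System.Threading, System.Threading.Tasks); HeavyRead is a hypothetical stand-in for the blocking Parquet call:

        private static int HeavyRead()
        {
            Thread.Sleep(5000); // simulate a long, non-cancellable read
            return 42;
        }

        public static bool TryReadWithCancellation(CancellationToken token, out int result)
        {
            Task<int> task = Task.Run(HeavyRead);

            // Task.Wait(timeout) returns false until the task finishes, so we can
            // re-check the token once a second, exactly like the loop above.
            while (!task.Wait(1000))
            {
                if (token.IsCancellationRequested)
                {
                    result = 0;
                    return false; // give up waiting; the underlying read itself cannot be aborted
                }
            }

            result = task.Result;
            return true;
        }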
Example #2
        public static void TestAgainstThirdParty()
        {
            var columns = new Column[] { new Column <decimal>("Decimal", LogicalType.Decimal(precision: 29, scale: 3)) };
            var values  = Enumerable.Range(0, 10_000)
                          .Select(i => ((decimal)i * i * i) / 1000 - 10)
                          .Concat(new [] { decimal.MinValue / 1000, decimal.MaxValue / 1000 })
                          .ToArray();

            using var buffer = new ResizableBuffer();

            // Write using ParquetSharp
            using (var outStream = new BufferOutputStream(buffer))
            {
                using var fileWriter     = new ParquetFileWriter(outStream, columns, Compression.Snappy);
                using var rowGroupWriter = fileWriter.AppendRowGroup();
                using var columnWriter   = rowGroupWriter.NextColumn().LogicalWriter <decimal>();

                columnWriter.WriteBatch(values);

                fileWriter.Close();
            }

            // Read using Parquet.NET
            using var memoryStream   = new MemoryStream(buffer.ToArray());
            using var fileReader     = new ParquetReader(memoryStream);
            using var rowGroupReader = fileReader.OpenRowGroupReader(0);

            var read = (decimal[])rowGroupReader.ReadColumn(fileReader.Schema.GetDataFields()[0]).Data;

            Assert.AreEqual(values, read);
        }
Example #3
        public void FileMetadata_sets_num_rows_on_file_and_row_group_multiple_row_groups()
        {
            var ms = new MemoryStream();
            var id = new DataField <int>("id");

            //write
            using (var writer = new ParquetWriter(new Schema(id), ms))
            {
                using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
                {
                    rg.WriteColumn(new DataColumn(id, new[] { 1, 2, 3, 4 }));
                }

                using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
                {
                    rg.WriteColumn(new DataColumn(id, new[] { 5, 6 }));
                }
            }

            //read back
            using (var reader = new ParquetReader(ms))
            {
                Assert.Equal(6, reader.ThriftMetadata.Num_rows);

                using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(0))
                {
                    Assert.Equal(4, rg.RowCount);
                }

                using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(1))
                {
                    Assert.Equal(2, rg.RowCount);
                }
            }
        }
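
Since the file-level row count is the sum of the per-row-group counts, a consistency check over all row groups is a natural extension of this test. A sketch using only the members already exercised above (RowGroupCount, OpenRowGroupReader, RowCount), placed inside the same read-back block:

            long totalRows = 0;

            for (int i = 0; i < reader.RowGroupCount; i++)
            {
                using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(i))
                {
                    totalRows += rg.RowCount; // 4 + 2 for the file written above
                }
            }

            Assert.Equal(reader.ThriftMetadata.Num_rows, totalRows);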
Example #4
        public override IEnumerable <object> AsEnumerable(object source, Func <object, bool?> filterFunc = null)
        {
            if (source == null)
            {
                yield break;
            }

            ParquetReader sr = source as ParquetReader;

            ChoGuard.ArgumentNotNull(sr, "ParquetReader");

            InitializeRecordConfiguration(Configuration);

            if (!RaiseBeginLoad(sr))
            {
                yield break;
            }

            if (InterceptRowGroup)
            {
                foreach (var item in AsEnumerable(ReadObjectsByRowGroup(sr).SelectMany(i => i.Select(i1 => i1)), TraceSwitch, filterFunc))
                {
                    yield return(item);
                }
            }
            else
            {
                foreach (var item in AsEnumerable(ReadAllObjects(sr), TraceSwitch, filterFunc))
                {
                    yield return(item);
                }
            }

            RaiseEndLoad(sr);
        }
Example #5
        public void List_of_elements_with_some_items_empty_reads_file()
        {
            /*
             * list data:
             * - 1: [1, 2, 3]
             * - 2: []
             * - 3: [1, 2, 3]
             * - 4: []
             */

            using (var reader = new ParquetReader(OpenTestFile("listofitems-empty-alternates.parquet")))
            {
                using (ParquetRowGroupReader groupReader = reader.OpenRowGroupReader(0))
                {
                    Assert.Equal(4, groupReader.RowCount);
                    DataField[] fs = reader.Schema.GetDataFields();

                    DataColumn id = groupReader.ReadColumn(fs[0]);
                    Assert.Equal(4, id.Data.Length);
                    Assert.False(id.HasRepetitions);

                    DataColumn list = groupReader.ReadColumn(fs[1]);
                    Assert.Equal(8, list.Data.Length);
                    Assert.Equal(new int[] { 0, 1, 1, 0, 0, 1, 1, 0 }, list.RepetitionLevels);
                }
            }
        }
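
For readers new to repetition levels: level 0 starts a new row and level 1 continues the current list, which is why the four lists above flatten to eight entries (each empty list still occupies one slot, flagged as empty by definition levels rather than repetition levels). A minimal BCL-only sketch of that decoding rule, ignoring the definition-level detail:

        static List<List<int>> SplitByRepetitionLevel(int[] values, int[] repetitionLevels)
        {
            var rows = new List<List<int>>();

            for (int i = 0; i < values.Length; i++)
            {
                if (repetitionLevels[i] == 0)
                {
                    rows.Add(new List<int>()); // level 0: a new row begins here
                }

                // NOTE: a real decoder would consult definition levels to skip the
                // placeholder entry an empty list leaves behind; omitted for brevity.
                rows[rows.Count - 1].Add(values[i]);
            }

            return rows;
        }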
Example #6
        public void Special_read_file_with_multiple_row_groups()
        {
            var ms = new MemoryStream();

            //create multirowgroup file

            //first row group
            var t = new Table(new DataField <int>("id"));

            t.Add(1);
            t.Add(2);
            using (var writer = new ParquetWriter(t.Schema, ms))
            {
                writer.Write(t);
            }

            //second row group
            t.Clear();
            t.Add(3);
            t.Add(4);
            using (var writer = new ParquetWriter(t.Schema, ms, null, true))
            {
                writer.Write(t);
            }

            //read back as table
            t = ParquetReader.ReadTableFromStream(ms);
            Assert.Equal(4, t.Count);
        }
Example #7
        public void Read_simple_repeated_field()
        {
            /*
             * root
             |-- cities: array (nullable = true)
             |    |-- element: string (containsNull = true)
             |-- id: long (nullable = true)
             */

            DataSet ds = ParquetReader.ReadFile(GetDataFilePath("simplerepeated.parquet"));

            Assert.Equal(2, ds.Schema.Length);
            Assert.Equal(typeof(IEnumerable <string>), ds.Schema[0].ColumnType);
            Assert.Equal(typeof(string), ds.Schema[0].ElementType);
            Assert.Equal(typeof(long), ds.Schema[1].ElementType);

            Assert.Equal("cities", ds.Schema[0].Name);
            Assert.Equal("id", ds.Schema[1].Name);

            Assert.True(ds.Schema[0].IsRepeated);
            Assert.False(ds.Schema[1].IsRepeated);

            Assert.Equal(1L, ds[0][1]);
            Assert.Equal(new[] { "London", "Derby", "Paris", "New York" }, ds[0][0]);
        }
Example #8
        public override IEnumerable <object> AsEnumerable(object source, Func <object, bool?> filterFunc = null)
        {
            if (source == null)
            {
                yield break;
            }

            ParquetReader sr = source as ParquetReader;

            ChoGuard.ArgumentNotNull(sr, "ParquetReader");

            InitializeRecordConfiguration(Configuration);

            if (!RaiseBeginLoad(sr))
            {
                yield break;
            }

            foreach (var item in AsEnumerable(ReadObjects(sr), TraceSwitch, filterFunc))
            {
                yield return(item);
            }

            RaiseEndLoad(sr);
        }
Example #9
        public IEnumerator <T> GetEnumerator()
        {
            CheckDisposed();
            ChoParquetRecordReader rr = new ChoParquetRecordReader(typeof(T), Configuration);

            if (_streamReader != null)
            {
                _parquetReader = Create(_streamReader.Value);
            }

            rr.Reader                     = this;
            rr.TraceSwitch                = TraceSwitch;
            rr.RowsLoaded                += NotifyRowsLoaded;
            rr.BeforeRowGroupLoad        += BeforeRowGroupLoad;
            rr.AfterRowGroupLoaded       += AfterRowGroupLoaded;
            rr.MembersDiscovered         += MembersDiscovered;
            rr.RecordFieldTypeAssessment += RecordFieldTypeAssessment;
            var beforeRowGroup = BeforeRowGroupLoad;
            var afterRowGroup  = AfterRowGroupLoaded;

            if (beforeRowGroup != null || afterRowGroup != null)
            {
                rr.InterceptRowGroup = true;
            }

            var e = rr.AsEnumerable(_parquetReader).GetEnumerator();

            return(ChoEnumeratorWrapper.BuildEnumerable <T>(() => {
                ++_recordNumber;
                return e.MoveNext();
            }, () => (T)ChoConvert.ChangeType <ChoRecordFieldAttribute>(e.Current, typeof(T)), () => Dispose()).GetEnumerator());
        }
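
For context, this enumerator is what backs the reader's ordinary foreach consumption. A hedged usage sketch (ChoParquetReader is the ChoETL entry point; the file name and the Id/Name fields are assumptions):

        using (var r = new ChoParquetReader("data.parquet"))
        {
            foreach (dynamic rec in r)
            {
                Console.WriteLine($"{rec.Id}: {rec.Name}");
            }
        }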
Example #10
        private void CompareWithMr(Table t)
        {
            string testFileName = Path.GetFullPath("temp.parquet");

            if (F.Exists(testFileName))
            {
                F.Delete(testFileName);
            }

            //produce file
            using (Stream s = F.OpenWrite(testFileName))
            {
                using (var writer = new ParquetWriter(t.Schema, s))
                {
                    writer.Write(t);
                }
            }

            //read back
            Table t2 = ParquetReader.ReadTableFromFile(testFileName);

            //check we don't have a bug internally before launching MR
            Assert.Equal(t.ToString("j"), t2.ToString("j"), ignoreLineEndingDifferences: true);

            string mrJson = ExecAndGetOutput(_javaExecName, $"-jar {_toolsJarPath} cat -j {testFileName}");

            Assert.Equal(t.ToString("j"), mrJson);
        }
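
ExecAndGetOutput is a test helper not shown in this listing; one plausible shape for it, assuming it only needs to capture stdout from the java process (System.Diagnostics):

        private static string ExecAndGetOutput(string fileName, string arguments)
        {
            var psi = new ProcessStartInfo(fileName, arguments)
            {
                RedirectStandardOutput = true,
                UseShellExecute        = false
            };

            using (Process p = Process.Start(psi))
            {
                string output = p.StandardOutput.ReadToEnd(); // read before waiting to avoid a pipe deadlock
                p.WaitForExit();
                return output.Trim();
            }
        }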
Example #11
        public void Read_multiple_data_pages()
        {
            using (var reader =
                       new ParquetReader(OpenTestFile("/special/multi_data_page.parquet"), leaveStreamOpen: false))
            {
                DataColumn[] columns = reader.ReadEntireRowGroup();

                string[]  s = (string[])columns[0].Data;
                double?[] d = (double?[])columns[1].Data;

                // check for nulls (issue #370)
                for (int i = 0; i < s.Length; i++)
                {
                    Assert.True(s[i] != null, "found null in s at " + i);
                    Assert.True(d[i] != null, "found null in d at " + i);
                }

                // run aggregations checking row alignment (issue #371)
                var seq = s.Zip(d.Cast <double>(), (w, v) => new { w, v })
                          .Where(p => p.w == "general")
                          .ToList();

                // double matching is fuzzy, but matching strings is enough for this test
                Assert.Equal("0.754359925788497", seq.Min(p => p.v).ToString(CultureInfo.InvariantCulture));
                Assert.Equal("0.85776", seq.Max(p => p.v).ToString(CultureInfo.InvariantCulture));
            }
        }
Example #12
        public void Write_and_read_nullable_integers()
        {
            var ds = new DataSet(new DataField <int?>("id"))
            {
                1,
                2,
                3,
                (object)null,
                4,
                (object)null,
                5
            };
            var ms = new MemoryStream();

            ParquetWriter.Write(ds, ms);

            ms.Position = 0;
            DataSet ds1 = ParquetReader.Read(ms);

            Assert.Equal(1, ds1[0].GetInt(0));
            Assert.Equal(2, ds1[1].GetInt(0));
            Assert.Equal(3, ds1[2].GetInt(0));
            Assert.True(ds1[3].IsNullAt(0));
            Assert.Equal(4, ds1[4].GetInt(0));
            Assert.True(ds1[5].IsNullAt(0));
            Assert.Equal(5, ds1[6].GetInt(0));
        }
Example #13
        public void Reads_really_mad_nested_file()
        {
            /* Spark schema:
             * root
             |-- addresses: array (nullable = true)
             |    |-- element: struct (containsNull = true)
             |    |    |-- line1: string (nullable = true)
             |    |    |-- name: string (nullable = true)
             |    |    |-- openingHours: array (nullable = true)
             |    |    |    |-- element: long (containsNull = true)
             |    |    |-- postcode: string (nullable = true)
             |-- cities: array (nullable = true)
             |    |-- element: string (containsNull = true)
             |-- comment: string (nullable = true)
             |-- id: long (nullable = true)
             |-- location: struct (nullable = true)
             |    |-- latitude: double (nullable = true)
             |    |-- longitude: double (nullable = true)
             |-- price: struct (nullable = true)
             |    |-- lunch: struct (nullable = true)
             |    |    |-- max: long (nullable = true)
             |    |    |-- min: long (nullable = true)
             */


            DataSet ds = ParquetReader.Read(OpenTestFile("nested.parquet"));

            //much easier to compare mad nesting with .ToString(), but this will break when the format changes
            Assert.Equal("{[{Dante Road;Head Office;[9;10;11;12;13;14;15;16;17;18];SE11};{Somewhere Else;Small Office;[6;7;19;20;21;22;23];TN19}];[London;Derby];this file contains all the permunations for nested structures and arrays to test Parquet parser;1;{51.2;66.3};{{2;1}}}", ds[0].ToString());
            Assert.Equal("{[{Dante Road;Head Office;[9;10;11;12;13;14;15;16;17;18];SE11};{Somewhere Else;Small Office;[6;7;19;20;21;22;23];TN19}];[London;Derby];this file contains all the permunations for nested structures and arrays to test Parquet parser;1;{51.2;66.3};{{2;1}}}", ds[1].ToString());
        }
Example #14
        public void Type_writes_and_reads_end_to_end(SchemaElement schema, object value, string name = null)
        {
            var ds = new DataSet(schema)
            {
                new Row(value)
            };
            var ms = new MemoryStream();

            ParquetWriter.Write(ds, ms);

            ms.Position = 0;
            DataSet ds1 = ParquetReader.Read(ms);

            object expectedValue = ds[0][0];
            object actualValue   = ds1[0][0];

            if (schema.ElementType == typeof(DateTime))
            {
                actualValue = ((DateTimeOffset)actualValue).DateTime;
            }

            Assert.True(expectedValue.Equals(actualValue),
                        $"{name}| expected: {expectedValue}, actual: {actualValue}, schema element: {schema}");

            //if (schema.ElementType == typeof(decimal)) ParquetWriter.WriteFile(ds1, "c:\\tmp\\decimals.parquet");
        }
Example #15
        public void List_simple_element_write_read()
        {
            var table = new Table(
                new Schema(
                    new DataField <int>("id"),
                    new ListField("cities",
                                  new DataField <string>("name"))));

            var ms = new MemoryStream();

            table.Add(1, new[] { "London", "Derby" });
            table.Add(2, new[] { "Paris", "New York" });

            //write as table
            using (var writer = new ParquetWriter(table.Schema, ms))
            {
                writer.Write(table);
            }

            //read back into table
            ms.Position = 0;
            Table table2;

            using (var reader = new ParquetReader(ms))
            {
                table2 = reader.ReadAsTable();
            }

            //validate data
            Assert.Equal(table.ToString(), table2.ToString(), ignoreLineEndingDifferences: true);
        }
Example #16
        public static async Task <DataSet> LoadAsync(StorageFile file, int offset = 0, int count = 100)
        {
            using (IRandomAccessStreamWithContentType uwpStream = await file.OpenReadAsync())
            {
                using (Stream stream = uwpStream.AsStreamForRead())
                {
                    var readerOptions = new ReaderOptions()
                    {
                        Offset = offset,
                        Count  = count
                    };

                    var formatOptions = new ParquetOptions
                    {
                        TreatByteArrayAsString = true
                    };

                    try
                    {
                        return(ParquetReader.Read(stream, formatOptions, readerOptions));
                    }
                    catch (Exception ex)
                    {
                        var dialog = new MessageDialog(ex.Message, "Cannot open file");
                        await dialog.ShowAsync();

                        return(null);
                    }
                }
            }
        }
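
A hedged usage sketch for paging with the helper above, called from an async UWP method (StorageFile.GetFileFromPathAsync is standard Windows.Storage; the path and page size are assumptions):

        StorageFile file = await StorageFile.GetFileFromPathAsync(@"C:\data\sample.parquet");

        DataSet firstPage  = await LoadAsync(file);                          // rows 0..99
        DataSet secondPage = await LoadAsync(file, offset: 100, count: 100); // rows 100..199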
Example #17
        public void Flat_write_read()
        {
            var table = new Table(new Schema(new DataField <int>("id"), new DataField <string>("city")));
            var ms    = new MemoryStream();

            //generate fake data
            for (int i = 0; i < 1000; i++)
            {
                table.Add(new Row(i, "record#" + i));
            }

            //write to stream
            using (var writer = new ParquetWriter(table.Schema, ms))
            {
                writer.Write(table);
            }

            //read back into table
            ms.Position = 0;
            Table table2;

            using (var reader = new ParquetReader(ms))
            {
                table2 = reader.ReadAsTable();
            }

            //validate data
            Assert.True(table.Equals(table2, true));
        }
Example #18
        public void Write_and_read_nullable_integers()
        {
            var ds = new DataSet(new SchemaElement <int>("id"))
            {
                1,
                2,
                3,
                (object)null,
                4,
                (object)null,
                5
            };
            var ms = new MemoryStream();

            ParquetWriter.Write(ds, ms);

            ms.Position = 0;
            DataSet ds1 = ParquetReader.Read(ms);

            Assert.Equal(1, ds1[0].GetInt(0));
            Assert.Equal(2, ds1[1].GetInt(0));
            Assert.Equal(3, ds1[2].GetInt(0));
            Assert.True(ds1[3].IsNullAt(0));
            Assert.Equal(4, ds1[4].GetInt(0));
            Assert.True(ds1[5].IsNullAt(0));
            Assert.Equal(5, ds1[6].GetInt(0));
        }
Example #19
        public void Array_write_read()
        {
            var table = new Table(
                new Schema(
                    new DataField <int>("id"),
                    new DataField <string[]>("categories") //array field
                    )
                );
            var ms = new MemoryStream();

            table.Add(1, new[] { "1", "2", "3" });
            table.Add(3, new[] { "3", "3", "3" });

            //write to stream
            using (var writer = new ParquetWriter(table.Schema, ms))
            {
                writer.Write(table);
            }

            //System.IO.File.WriteAllBytes("c:\\tmp\\1.parquet", ms.ToArray());

            //read back into table
            ms.Position = 0;
            Table table2;

            using (var reader = new ParquetReader(ms))
            {
                table2 = reader.ReadAsTable();
            }

            //validate data
            Assert.Equal(table.ToString(), table2.ToString(), ignoreLineEndingDifferences: true);
        }
Example #20
 public ParquetDataReader(ParquetReader parquetReader, IEnumerable <DataField> fields, IEnumerable <ColumnConverter> columnConverters)
 {
     _parquetReader    = parquetReader ?? throw new ArgumentNullException(nameof(parquetReader));
     _fields           = fields?.ToList() ?? throw new ArgumentNullException(nameof(fields));
     _rowGroupCount    = parquetReader.RowGroupCount;
     _columnConverters = new Lazy <List <Func <object, object> > >(() => InitializeColumnConverters(columnConverters), true);
 }
Example #21
        public void Read_simple_nested_field()
        {
            /*
             * root
             |-- city: struct (nullable = true)
             |    |-- country: string (nullable = true)
             |    |-- isCapital: boolean (nullable = true)
             |    |-- name: string (nullable = true)
             |-- id: long (nullable = true)
             */

            //Assert.Throws<NotSupportedException>(() => ParquetReader.ReadFile(GetDataFilePath("simplenested.parquet")));
            //return;

            DataSet ds = ParquetReader.ReadFile(GetDataFilePath("simplenested.parquet"));

            Assert.Equal(1, ds.RowCount);
            Assert.Equal(2, ds.ColumnCount);

            Assert.Equal(typeof(Row), ds.Schema[0].ElementType);
            Assert.Equal(typeof(long), ds.Schema[1].ElementType);

            Assert.Equal("city", ds.Schema.ColumnNames[0]);
            Assert.Equal("id", ds.Schema.ColumnNames[1]);

            Row mr = ds[0];

            Row city = mr.Get <Row>(0);

            Assert.Equal("United Kingdom", city[0]);
            Assert.True((bool)city[1]);
            Assert.Equal("London", city[2]);

            Assert.Equal(1L, mr[1]);
        }
Example #22
        public static Dictionary <int, string> ReadParquetFile(string infile)
        {
            Dictionary <int, string> serializedRequests = new Dictionary <int, string>();

            string path = Path.GetFullPath(Path.Combine(Directory.GetCurrentDirectory(), infile));

            using (Stream fileStream = File.OpenRead(path))
            {
                using (var parquetReader = new ParquetReader(fileStream))
                {
                    DataField[] dataFields = parquetReader.Schema.GetDataFields();

                    using (ParquetRowGroupReader groupReader = parquetReader.OpenRowGroupReader(0))
                    {
                        DataColumn[] columns      = dataFields.Select(groupReader.ReadColumn).ToArray();
                        DataColumn   firstColumn  = columns[0];
                        DataColumn   secondColumn = columns[1];

                        Array idData      = firstColumn.Data;
                        Array requestData = secondColumn.Data;

                        for (var j = 0; j < firstColumn.Data.Length; j++)
                        {
                            var convertedRequestData = (string)requestData.GetValue(j);
                            var convertedIdData      = (int)idData.GetValue(j);
                            serializedRequests.Add(convertedIdData, convertedRequestData);
                        }
                    }

                    return(serializedRequests);
                }
            }
        }
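
A short usage sketch for the helper above (the file name and the way results are consumed are assumptions):

        Dictionary<int, string> requests = ReadParquetFile("requests.parquet");

        foreach (KeyValuePair<int, string> kv in requests)
        {
            Console.WriteLine($"request {kv.Key}: {kv.Value}");
        }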
Example #23
        public void Read_multiple_data_pages()
        {
            using (var reader =
                       new ParquetReader(OpenTestFile("/special/multi_data_page.parquet"), leaveStreamOpen: false))
            {
                DataColumn[] columns = reader.ReadEntireRowGroup();

                string[]  s = (string[])columns[0].Data;
                double?[] d = (double?[])columns[1].Data;

                // check for nulls (issue #370)
                for (int i = 0; i < s.Length; i++)
                {
                    Assert.True(s[i] != null, "found null in s at " + i);
                    Assert.True(d[i] != null, "found null in d at " + i);
                }

                // run aggregations checking row alignment (issue #371)
                var seq = s.Zip(d.Cast <double>(), (w, v) => new { w, v })
                          .Where(p => p.w == "favorable")
                          .ToList();

                // double matching is fuzzy, but matching strings is enough for this test
                // ground truth was computed using Spark
                Assert.Equal(26706.6185312147, seq.Sum(p => p.v), 5);
                Assert.Equal(0.808287234987281, seq.Average(p => p.v), 5);
                Assert.Equal(0.71523915461624, seq.Min(p => p.v), 5);
                Assert.Equal(0.867111980015206, seq.Max(p => p.v), 5);
            }
        }
Example #24
        public void All_compression_methods_supported(CompressionMethod compressionMethod)
        {
            //v2
            var     ms  = new MemoryStream();
            DataSet ds1 = new DataSet(new DataField <int>("id"));
            DataSet ds2;

            ds1.Add(5);

            //write
            using (var writer = new ParquetWriter(ms))
            {
                writer.Write(ds1, compressionMethod);
            }

            //read back
            using (var reader = new ParquetReader(ms))
            {
                ms.Position = 0;
                ds2         = reader.Read();
            }

            Assert.Equal(5, ds2[0].GetInt(0));

            //v3
            const int value  = 5;
            object    actual = WriteReadSingle(new DataField <int>("id"), value, compressionMethod);

            Assert.Equal(5, (int)actual);
        }
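
WriteReadSingle is a test helper not shown in this listing; a sketch of one possible implementation against the v3 row-group API used elsewhere on this page (the writer's CompressionMethod property and DataField.ClrType are assumed to behave as named):

        private static object WriteReadSingle(DataField field, object value, CompressionMethod compressionMethod)
        {
            var ms = new MemoryStream();

            using (var writer = new ParquetWriter(new Schema(field), ms))
            {
                writer.CompressionMethod = compressionMethod; // assumed property

                using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
                {
                    // build a one-element typed array matching the field's CLR type
                    Array data = Array.CreateInstance(field.ClrType, 1);
                    data.SetValue(value, 0);
                    rg.WriteColumn(new DataColumn(field, data));
                }
            }

            ms.Position = 0;

            using (var reader = new ParquetReader(ms))
            using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(0))
            {
                return rg.ReadColumn(field).Data.GetValue(0);
            }
        }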
Example #25
            protected override bool MoveNextCore()
            {
                if (_dataSetEnumerator.MoveNext())
                {
                    _curDataSetRow = _dataSetEnumerator.Current;
                    return(true);
                }
                else if (_blockEnumerator.MoveNext())
                {
                    _readerOptions.Offset = (long)_blockEnumerator.Current * _readerOptions.Count;

                    // When current dataset runs out, read the next portion of the parquet file.
                    DataSet ds;
                    lock (_loader._parquetStream)
                    {
                        ds = ParquetReader.Read(_loader._parquetStream, _loader._parquetOptions, _readerOptions);
                    }

                    var dataSetOrder = CreateOrderSequence(ds.RowCount);
                    _dataSetEnumerator = dataSetOrder.GetEnumerator();
                    _curDataSetRow     = dataSetOrder.ElementAt(0);

                    // Cache list for each active column
                    for (int i = 0; i < _actives.Length; i++)
                    {
                        Column col = _loader._columnsLoaded[_actives[i]];
                        _columnValues[i] = ds.GetColumn(col.DataField);
                    }

                    return(_dataSetEnumerator.MoveNext());
                }
                return(false);
            }
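
The key line is the offset computation: block k of size Count starts at row k * Count. The same arithmetic in isolation, as a BCL-only sketch:

        static IEnumerable<ArraySegment<int>> Pages(int[] source, int pageSize)
        {
            for (int offset = 0; offset < source.Length; offset += pageSize)
            {
                // the final page may be shorter than pageSize
                int length = Math.Min(pageSize, source.Length - offset);
                yield return new ArraySegment<int>(source, offset, length);
            }
        }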
Example #26
        public void ReadIntro()
        {
            // open file stream
            using (Stream fileStream = System.IO.File.OpenRead("c:\\test.parquet"))
            {
                // open parquet file reader
                using (var parquetReader = new ParquetReader(fileStream))
                {
                    // get file schema (available straight after opening parquet reader)
                    // however, get only data fields as only they contain data values
                    DataField[] dataFields = parquetReader.Schema.GetDataFields();

                    // enumerate through row groups in this file
                    for (int i = 0; i < parquetReader.RowGroupCount; i++)
                    {
                        // create row group reader
                        using (ParquetRowGroupReader groupReader = parquetReader.OpenRowGroupReader(i))
                        {
                            // read all columns inside each row group (you also have the
                            // option to read only the columns you need)
                            DataColumn[] columns = dataFields.Select(groupReader.ReadColumn).ToArray();

                            // get first column, for instance
                            DataColumn firstColumn = columns[0];

                            // .Data member contains a typed array of column data you can cast to the type of the column
                            Array data = firstColumn.Data;
                            int[] ids  = (int[])data;
                        }
                    }
                }
            }
        }
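
The comment above notes that you can read only the columns you need; a minimal sketch of that, reusing members already shown (Schema.GetDataFields, OpenRowGroupReader, ReadColumn). The "id" column name is an assumption:

        using (Stream fileStream = System.IO.File.OpenRead("c:\\test.parquet"))
        using (var parquetReader = new ParquetReader(fileStream))
        {
            DataField idField = parquetReader.Schema.GetDataFields()
                                .First(f => f.Name == "id"); // assumed column name

            using (ParquetRowGroupReader groupReader = parquetReader.OpenRowGroupReader(0))
            {
                int[] ids = (int[])groupReader.ReadColumn(idField).Data;
            }
        }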
Example #27
        public void Write_read_nullable_column(Array input)
        {
            var id = new DataField <int?>("id");
            var ms = new MemoryStream();

            using (var writer = new ParquetWriter(new Schema(id), ms))
            {
                using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
                {
                    rg.WriteColumn(new DataColumn(id, input));
                }
            }

            ms.Position = 0;
            using (var reader = new ParquetReader(ms))
            {
                Assert.Equal(1, reader.RowGroupCount);

                using (ParquetRowGroupReader rg = reader.OpenRowGroupReader(0))
                {
                    Assert.Equal(input.Length, rg.RowCount);
                    Assert.Equal(input, rg.ReadColumn(id).Data);
                }
            }
        }
Example #28
      public void I_can_write_snappy_and_read_back()
      {
         var ms = new MemoryStream();
         var ds1 = new DataSet(
            new DataField<int>("id"),
            new DataField<int>("no"));

         ds1.Add(1, 3);
         ds1.Add(2, 4);

         DataSet ds2;

         //write
         using (var writer = new ParquetWriter(ms))
         {
            writer.Write(ds1, CompressionMethod.Snappy);
         }

         //read back
         using (var reader = new ParquetReader(ms))
         {
            ms.Position = 0;
            ds2 = reader.Read();
         }

         Assert.Equal(1, ds2[0].GetInt(0));
         Assert.Equal(2, ds2[1].GetInt(0));
         Assert.Equal(3, ds2[0].GetInt(1));
         Assert.Equal(4, ds2[1].GetInt(1));
      }
Example #29
        public void CustomMetadata_can_write_and_read()
        {
            var ms = new MemoryStream();
            var id = new DataField <int>("id");

            //write
            using (var writer = new ParquetWriter(new Schema(id), ms))
            {
                writer.CustomMetadata = new Dictionary <string, string>
                {
                    ["key1"] = "value1",
                    ["key2"] = "value2"
                };

                using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
                {
                    rg.WriteColumn(new DataColumn(id, new[] { 1, 2, 3, 4 }));
                }
            }

            //read back
            using (var reader = new ParquetReader(ms))
            {
                Assert.Equal("value1", reader.CustomMetadata["key1"]);
                Assert.Equal("value2", reader.CustomMetadata["key2"]);
            }
        }
Example #30
 protected Thrift.FileMetaData ReadInternalMetadata()
 {
     using (var reader = ParquetReader.OpenFromFile(_path))
     {
         return(reader.ThriftMetadata);
     }
 }