Beispiel #1
0
        /// <summary>
        /// BackgroundWorker DoWork handler that reads the requested rows from the open
        /// parquet file on the thread pool, polling so the worker can observe cancellation.
        /// </summary>
        /// <param name="sender">The <see cref="BackgroundWorker"/> that raised the event.</param>
        /// <param name="e">Receives the <c>ParquetReadResult</c> in <c>e.Result</c>, or the cancel flag in <c>e.Cancel</c>.</param>
        private void ReadDataBackgroundWorker_DoWork(object sender, DoWorkEventArgs e)
        {
            //Parquet.NET doesn't have any async methods or readers that allow sequential records reading so we need to use the ThreadPool to support cancellation.
            //Use Task.Run directly: the original `Task<ParquetReadResult>.Run` merely accessed the inherited
            //static Run through a constructed generic type, which is misleading — inference yields Task<ParquetReadResult> either way.
            var task = Task.Run(() =>
            {
                //Unfortunately there's no way to quickly get the metadata from a parquet file without reading an actual data row
                //BUG: Parquet.NET doesn't always respect the Count parameter, sometimes returning more than the passed value...
                using (var parquetReader = ParquetReader.OpenFromFile(this.OpenFilePath, new ParquetOptions()
                {
                    TreatByteArrayAsString = true
                }))
                {
                    int totalRowCount = 0;
                    DataTable result = UtilityMethods.ParquetReaderToDataTable(parquetReader, out totalRowCount, this.SelectedFields, this.CurrentOffset, this.CurrentMaxRowCount);
                    return new ParquetReadResult(result, totalRowCount);
                }
            });

            //Poll in one-second slices so a cancellation request is noticed promptly.
            //Note: the read itself cannot be aborted; on cancellation we merely stop waiting for it.
            while (!task.IsCompleted && !((BackgroundWorker)sender).CancellationPending)
            {
                task.Wait(1000);
            }

            e.Cancel = ((BackgroundWorker)sender).CancellationPending;

            if (task.IsCompleted)
            {
                e.Result = task.Result;
            }
        }
 /// <summary>
 /// Opens the parquet file at _path and returns its raw thrift file metadata.
 /// The reader is disposed before returning; only the metadata object survives.
 /// </summary>
 protected Thrift.FileMetaData ReadInternalMetadata()
 {
     ParquetReader reader = ParquetReader.OpenFromFile(_path);
     try
     {
         return reader.ThriftMetadata;
     }
     finally
     {
         reader.Dispose();
     }
 }
Beispiel #3
0
        /// <summary>
        /// Records the command telemetry, then opens the parquet file and prints its schema.
        /// </summary>
        public void Execute()
        {
            Telemetry.CommandExecuted("schema", "path", _path);

            using (var parquetReader = ParquetReader.OpenFromFile(_path))
            {
                PrintSchema(parquetReader.Schema);
            }
        }
        /// <summary>
        /// Records the command telemetry, then prints the parquet file's schema
        /// together with the elapsed time measured while opening and reading it.
        /// </summary>
        public void Execute()
        {
            Telemetry.CommandExecuted("schema", "path", _path);

            using (var time = new TimeMeasure())
            using (var parquetReader = ParquetReader.OpenFromFile(_path))
            {
                PrintSchema(parquetReader.Schema, time.Elapsed);
            }
        }
Beispiel #5
0
        /// <summary>
        /// Micro-benchmark helper: reads every column of the first row group of a
        /// known test file, treating byte arrays as strings. The result is discarded.
        /// </summary>
        private static void ReadPerf()
        {
            var options = new ParquetOptions {
                TreatByteArrayAsString = true
            };

            using (var reader = ParquetReader.OpenFromFile(@"C:\dev\parquet-dotnet\src\Parquet.Test\data\customer.impala.parquet", options))
            {
                var columns = new List <DataColumn>();

                using (ParquetRowGroupReader groupReader = reader.OpenRowGroupReader(0))
                {
                    foreach (DataField field in reader.Schema.GetDataFields())
                    {
                        columns.Add(groupReader.ReadColumn(field));
                    }
                }
            }
        }
Beispiel #6
0
        /// <summary>
        /// Reads the first row group of a large test file, logging the row-group
        /// count and the number of values read for each column.
        /// </summary>
        private static void ReadLargeFile()
        {
            var options = new ParquetOptions {
                TreatByteArrayAsString = true
            };

            using (var reader = ParquetReader.OpenFromFile(@"C:\dev\parquet-dotnet\src\Parquet.Test\data\customer.impala.parquet", options))
            {
                log.Trace("row groups: {0}", reader.RowGroupCount);

                using (ParquetRowGroupReader groupReader = reader.OpenRowGroupReader(0))
                {
                    foreach (DataField field in reader.Schema.GetDataFields())
                    {
                        DataColumn column = groupReader.ReadColumn(field);

                        log.Trace("col {0}, values: {1}", field, column.Data.Length);
                    }
                }
            }
        }
Beispiel #7
0
        /// <summary>
        /// Verifies that ParquetReader.OpenFromFile releases its file handle on dispose:
        /// if the underlying stream were left open, deleting the temp copy would fail.
        /// </summary>
        public void ParquetReader_OpenFromFile_Close_Stream()
        {
            // copy a test file to a temporary location
            string tempFile = Path.GetTempFileName();

            using (Stream source = OpenTestFile("map_simple.parquet"))
            {
                using (FileStream destination = System.IO.File.OpenWrite(tempFile))
                {
                    source.CopyTo(destination);
                }
            }

            // open and immediately dispose the copy
            using (var reader = ParquetReader.OpenFromFile(tempFile))
            {
                // no reads needed; disposal behavior is what we are testing
            }

            // deleting only succeeds if the reader closed its stream
            System.IO.File.Delete(tempFile);
        }
        /// <summary>
        /// Reads the parquet file at _path into a Table, showing a progress message
        /// and surfacing any failure through it before rethrowing.
        /// </summary>
        /// <param name="maxRows">Advisory row count shown in the progress message.</param>
        /// <returns>The file contents as a Table.</returns>
        protected Table ReadTable(int maxRows = 10)
        {
            using (var msg = new ProgressMessage($"reading file ({maxRows} rows min)..."))
            {
                try
                {
                    var options = new ParquetOptions {
                        TreatByteArrayAsString = true
                    };
                    using (var reader = ParquetReader.OpenFromFile(_path, options))
                    {
                        return reader.ReadAsTable();
                    }
                }
                catch (Exception ex)
                {
                    msg.Fail(ex.Message);
                    throw;
                }
            }
        }
Beispiel #9
0
        /// <summary>
        /// BackgroundWorker DoWork handler that produces the parquet file's schema.
        /// Reuses a cached schema when one is passed as the argument; otherwise opens
        /// the file on the thread pool and polls so cancellation can be observed.
        /// </summary>
        /// <param name="sender">The <see cref="BackgroundWorker"/> that raised the event.</param>
        /// <param name="e">May carry a cached schema in <c>e.Argument</c>; receives the schema in <c>e.Result</c>.</param>
        private void FileSchemaBackgroundWorker_DoWork(object sender, DoWorkEventArgs e)
        {
            var schema = (Parquet.Data.Schema)e.Argument;

            if (schema != null)
            {
                //A schema was already loaded earlier; reuse it instead of re-opening the file.
                e.Result = schema;
            }
            else
            {
                //Parquet.NET doesn't have any async methods or readers that allow sequential reading so we need to use the ThreadPool to support cancellation.
                //Use Task.Run directly: the original `Task<ParquetReader>.Run` accessed the inherited static Run
                //through a constructed generic type whose argument didn't even match the lambda's result (a Schema).
                var task = Task.Run(() =>
                {
                    //Unfortunately there's no way to quickly get the metadata from a parquet file without reading an actual data row
                    using (var parquetReader = ParquetReader.OpenFromFile(this.OpenFilePath, new ParquetOptions()
                    {
                        TreatByteArrayAsString = true
                    }))
                    {
                        return parquetReader.Schema;
                    }
                });

                //Poll in one-second slices so a cancellation request is noticed promptly.
                //Note: the read itself cannot be aborted; on cancellation we merely stop waiting for it.
                while (!task.IsCompleted && !((BackgroundWorker)sender).CancellationPending)
                {
                    task.Wait(1000);
                }

                e.Cancel = ((BackgroundWorker)sender).CancellationPending;

                if (task.IsCompleted)
                {
                    e.Result = task.Result;
                }
            }
        }
Beispiel #10
0
        /// <summary>
        /// BackgroundWorker DoWork handler that reads the requested rows from the open
        /// parquet file: with a single reader for the default engine, or with one reader
        /// per field group running in parallel otherwise, then merges the partial tables
        /// into a single ParquetReadResult placed in e.Result.
        /// </summary>
        /// <param name="sender">The BackgroundWorker raising the event; polled for CancellationPending.</param>
        /// <param name="e">Receives the merged ParquetReadResult in e.Result, or the cancel flag in e.Cancel.</param>
        private void ReadDataBackgroundWorker_DoWork(object sender, DoWorkEventArgs e)
        {
            //Parquet.NET doesn't have any async methods or readers that allow sequential records reading so we need to use the ThreadPool to support cancellation.
            Task task              = null;
            //Partial results keyed by field-group index so they can be merged back in order.
            var  results           = new ConcurrentDictionary <int, ParquetReadResult>();
            //NOTE(review): despite the name this is a CancellationTokenSource, and it is never disposed — consider renaming and disposing it.
            var  cancellationToken = new System.Threading.CancellationTokenSource();

            if (AppSettings.ReadingEngine == ParquetEngine.Default)
            {
                //Default engine: one reader processes all selected fields; the single result is stored under key 1.
                task = Task.Run(() =>
                {
                    using (var parquetReader = ParquetReader.OpenFromFile(this.OpenFilePath, new ParquetOptions()
                    {
                        TreatByteArrayAsString = true
                    }))
                    {
                        DataTable result = UtilityMethods.ParquetReaderToDataTable(parquetReader, this.SelectedFields, this.CurrentOffset, this.CurrentMaxRowCount, cancellationToken.Token);
                        results.TryAdd(1, new ParquetReadResult(result, parquetReader.ThriftMetadata.Num_rows));
                    }
                });
            }
            else
            {
                //Parallel engine: split the selected fields into groups, one reader per group.
                int i           = 0;
                var fieldGroups = new List <(int, List <string>)>();
                //NOTE(review): 'selectedFields' (lower case) vs 'SelectedFields' above — presumably the backing field of the same property; verify.
                //NOTE(review): integer division may yield 0 when there are fewer fields than processors — assumes UtilityMethods.Split tolerates a 0 chunk size; confirm.
                foreach (List <string> fields in UtilityMethods.Split(this.SelectedFields, (int)(this.selectedFields.Count / Environment.ProcessorCount)))
                {
                    fieldGroups.Add((i++, fields));
                }

                //Each group opens its own stream/reader; results land in the shared dictionary keyed by group index.
                task = ParallelAsync.ForeachAsync(fieldGroups, Environment.ProcessorCount,
                                                  async fieldGroup =>
                {
                    await Task.Run(() =>
                    {
                        using (Stream parquetStream = new FileStream(this.OpenFilePath, FileMode.Open, FileAccess.Read))
                            using (var parquetReader = new ParquetReader(parquetStream, new ParquetOptions()
                            {
                                TreatByteArrayAsString = true
                            }))
                            {
                                DataTable result = UtilityMethods.ParquetReaderToDataTable(parquetReader, fieldGroup.Item2, this.CurrentOffset, this.CurrentMaxRowCount, cancellationToken.Token);
                                results.TryAdd(fieldGroup.Item1, new ParquetReadResult(result, parquetReader.ThriftMetadata.Num_rows));
                            }
                    });
                });
            }

            //Poll in one-second slices so a cancellation request is noticed promptly.
            while (!task.IsCompleted && !((BackgroundWorker)sender).CancellationPending)
            {
                task.Wait(1000);
            }

            if (((BackgroundWorker)sender).CancellationPending)
            {
                //Signal the in-flight readers to stop via the token handed to ParquetReaderToDataTable.
                cancellationToken.Cancel();
                e.Cancel = true;
            }

            if (task.IsCompleted)
            {
                if (results.Count > 0)
                {
                    //Merge the partial tables in group order; the total row count is the same in every partial result.
                    DataTable         mergedDataTables = UtilityMethods.MergeTables(results.OrderBy(f => f.Key).Select(f => f.Value.Result).AsEnumerable());
                    ParquetReadResult finalResult      = new ParquetReadResult(mergedDataTables, results.First().Value.TotalNumberOfRecordsInFile);
                    e.Result = finalResult;
                }
                else
                {
                    //The code should never reach here
                    e.Result = new ParquetReadResult(new DataTable(), 0);
                }
            }
        }
Beispiel #11
0
        /// <summary>
        /// Benchmark: reads every column of the first row group of a known test file,
        /// then writes the data back out twice (uncompressed and gzip), timing each phase.
        /// </summary>
        /// <param name="readTime">Time spent reading the source file.</param>
        /// <param name="uncompressedWriteTime">Time spent writing without compression.</param>
        /// <param name="gzipWriteTime">Time spent writing with gzip compression.</param>
        private static void ReadLargeFile(out TimeSpan readTime,
                                          out TimeSpan uncompressedWriteTime,
                                          out TimeSpan gzipWriteTime)
        {
            Schema schema;
            DataColumn[] columns;

            // --- read phase ---
            using (var time = new TimeMeasure())
            {
                var options = new ParquetOptions {
                    TreatByteArrayAsString = true
                };
                using (var reader = ParquetReader.OpenFromFile(@"C:\dev\parquet-dotnet\src\Parquet.Test\data\customer.impala.parquet", options))
                {
                    schema = reader.Schema;
                    var columnList = new List <DataColumn>();

                    using (ParquetRowGroupReader groupReader = reader.OpenRowGroupReader(0))
                    {
                        foreach (DataField field in reader.Schema.GetDataFields())
                        {
                            columnList.Add(groupReader.ReadColumn(field));
                        }
                    }
                    columns = columnList.ToArray();
                }
                readTime = time.Elapsed;
            }

            // Writes all columns to fileName with the given compression, returning the elapsed time.
            TimeSpan WriteAll(string fileName, CompressionMethod compression)
            {
                using (FileStream dest = F.OpenWrite(fileName))
                using (var measure = new TimeMeasure())
                {
                    using (var writer = new ParquetWriter(schema, dest))
                    {
                        writer.CompressionMethod = compression;
                        using (ParquetRowGroupWriter rowGroup = writer.CreateRowGroup())
                        {
                            foreach (DataColumn column in columns)
                            {
                                rowGroup.WriteColumn(column);
                            }
                        }
                    }

                    return measure.Elapsed;
                }
            }

            // --- write phases ---
            uncompressedWriteTime = WriteAll("perf.uncompressed.parquet", CompressionMethod.None);
            gzipWriteTime         = WriteAll("perf.gzip.parquet", CompressionMethod.Gzip);
        }