/// <summary>
/// BackgroundWorker DoWork handler that reads a page of records from the parquet
/// file into a DataTable on the thread pool, polling for cancellation once per second.
/// </summary>
/// <param name="sender">The BackgroundWorker that raised the event (cast for CancellationPending).</param>
/// <param name="e">Receives the ParquetReadResult in e.Result, or e.Cancel = true on cancellation.</param>
private void ReadDataBackgroundWorker_DoWork(object sender, DoWorkEventArgs e) {
    //Parquet.NET doesn't have any async methods or readers that allow sequential records reading so we need to use the ThreadPool to support cancellation.
    var task = Task<ParquetReadResult>.Run(() =>
    {
        //Unfortunately there's no way to quickly get the metadata from a parquet file without reading an actual data row
        //BUG: Parquet.NET doesn't always respect the Count parameter, sometimes returning more than the passed value...
        using (var parquetReader = ParquetReader.OpenFromFile(this.OpenFilePath, new ParquetOptions() { TreatByteArrayAsString = true }))
        {
            int totalRowCount = 0;
            DataTable result = UtilityMethods.ParquetReaderToDataTable(parquetReader, out totalRowCount, this.SelectedFields, this.CurrentOffset, this.CurrentMaxRowCount);
            return(new ParquetReadResult(result, totalRowCount));
        }
    });

    // Poll instead of blocking indefinitely so the worker can notice a cancellation
    // request. NOTE(review): cancelling only abandons the task — the read itself has
    // no cancellation token and keeps running (holding the file open) until it
    // finishes on its own. Wait() rethrows (as AggregateException) if the task
    // faulted, which BackgroundWorker surfaces via RunWorkerCompletedEventArgs.Error.
    while (!task.IsCompleted && !((BackgroundWorker)sender).CancellationPending)
    {
        task.Wait(1000);
    }

    // If cancellation won the race, e.Cancel is reported even when the task
    // happened to complete in the meantime; the result is then discarded by
    // the BackgroundWorker infrastructure.
    e.Cancel = ((BackgroundWorker)sender).CancellationPending;
    if (task.IsCompleted)
    {
        e.Result = task.Result;
    }
}
/// <summary>
/// Opens the parquet file at <c>_path</c> just long enough to pull out its
/// low-level Thrift metadata, then closes it.
/// </summary>
/// <returns>The file's Thrift-level metadata.</returns>
protected Thrift.FileMetaData ReadInternalMetadata()
{
    using (ParquetReader parquetReader = ParquetReader.OpenFromFile(_path))
    {
        return parquetReader.ThriftMetadata;
    }
}
/// <summary>
/// Executes the schema command: records telemetry, then reads the schema of the
/// parquet file at <c>_path</c> and prints it.
/// </summary>
public void Execute()
{
    Telemetry.CommandExecuted("schema", "path", _path);

    using (ParquetReader reader = ParquetReader.OpenFromFile(_path))
    {
        PrintSchema(reader.Schema);
    }
}
/// <summary>
/// Executes the schema command: records telemetry, then reads the schema of the
/// parquet file at <c>_path</c> and prints it together with the elapsed time.
/// </summary>
public void Execute()
{
    Telemetry.CommandExecuted("schema", "path", _path);

    // Stacked usings keep the original dispose order: reader first, timer last.
    using (var stopwatch = new TimeMeasure())
    using (ParquetReader reader = ParquetReader.OpenFromFile(_path))
    {
        Schema fileSchema = reader.Schema;
        PrintSchema(fileSchema, stopwatch.Elapsed);
    }
}
/// <summary>
/// Performance probe: reads every data column of row group 0 from a known large
/// test file, treating byte arrays as strings. The columns are collected but not
/// otherwise used — only the read cost matters here.
/// </summary>
private static void ReadPerf()
{
    var options = new ParquetOptions { TreatByteArrayAsString = true };
    using (ParquetReader reader = ParquetReader.OpenFromFile(@"C:\dev\parquet-dotnet\src\Parquet.Test\data\customer.impala.parquet", options))
    {
        var columns = new List<DataColumn>();
        using (ParquetRowGroupReader rowGroupReader = reader.OpenRowGroupReader(0))
        {
            foreach (DataField dataField in reader.Schema.GetDataFields())
            {
                columns.Add(rowGroupReader.ReadColumn(dataField));
            }
        }
    }
}
/// <summary>
/// Reads every data column of row group 0 from a known large test file, logging
/// the row-group count and the number of values in each column as it goes.
/// </summary>
private static void ReadLargeFile()
{
    var options = new ParquetOptions { TreatByteArrayAsString = true };
    using (ParquetReader reader = ParquetReader.OpenFromFile(@"C:\dev\parquet-dotnet\src\Parquet.Test\data\customer.impala.parquet", options))
    {
        log.Trace("row groups: {0}", reader.RowGroupCount);

        using (ParquetRowGroupReader rowGroupReader = reader.OpenRowGroupReader(0))
        {
            foreach (DataField dataField in reader.Schema.GetDataFields())
            {
                DataColumn column = rowGroupReader.ReadColumn(dataField);
                log.Trace("col {0}, values: {1}", dataField, column.Data.Length);
            }
        }
    }
}
/// <summary>
/// Verifies that <c>ParquetReader.OpenFromFile</c> releases its file handle on
/// dispose: a copied temp file must be deletable after the reader is closed.
/// </summary>
public void ParquetReader_OpenFromFile_Close_Stream()
{
    // copy a file to a temp location
    string tempFile = Path.GetTempFileName();
    using (Stream source = OpenTestFile("map_simple.parquet"))
    using (FileStream destination = System.IO.File.OpenWrite(tempFile))
    {
        source.CopyTo(destination);
    }

    // open the copy
    using (ParquetReader reader = ParquetReader.OpenFromFile(tempFile))
    {
        // do nothing
    }

    // now try to delete this temp file. If the stream is properly closed, this should succeed
    System.IO.File.Delete(tempFile);
}
/// <summary>
/// Reads the parquet file at <c>_path</c> into a Table, showing a progress
/// message while working and surfacing any failure before rethrowing.
/// </summary>
/// <param name="maxRows">Displayed in the progress message only — the entire
/// file is always read regardless of this value.</param>
/// <returns>The file contents as a Table.</returns>
protected Table ReadTable(int maxRows = 10)
{
    using (var msg = new ProgressMessage($"reading file ({maxRows} rows min)..."))
    {
        try
        {
            var options = new ParquetOptions { TreatByteArrayAsString = true };
            using (ParquetReader reader = ParquetReader.OpenFromFile(_path, options))
            {
                return reader.ReadAsTable();
            }
        }
        catch (Exception ex)
        {
            // Report the failure on the progress message, then preserve the
            // original stack trace with a bare rethrow.
            msg.Fail(ex.Message);
            throw;
        }
    }
}
/// <summary>
/// BackgroundWorker DoWork handler that loads the parquet file's schema on the
/// thread pool, polling for cancellation once per second. If a schema was
/// supplied via <paramref name="e"/>.Argument, it is returned immediately
/// without touching the file.
/// </summary>
/// <param name="sender">The BackgroundWorker that raised the event (cast for CancellationPending).</param>
/// <param name="e">Receives the Schema in e.Result, or e.Cancel = true on cancellation.</param>
private void FileSchemaBackgroundWorker_DoWork(object sender, DoWorkEventArgs e)
{
    var schema = (Parquet.Data.Schema)e.Argument;
    if (schema != null)
    {
        // Caller already has the schema cached; no file access needed.
        e.Result = schema;
    }
    else
    {
        //Parquet.NET doesn't have any async methods or readers that allow sequential reading so we need to use the ThreadPool to support cancellation.
        // Fix: this was previously Task<ParquetReader>.Run(...) — misleading, since the
        // lambda returns a Schema and overload inference produced a Task<Schema> anyway.
        var task = Task.Run(() =>
        {
            //Unfortunately there's no way to quickly get the metadata from a parquet file without reading an actual data row
            using (var parquetReader = ParquetReader.OpenFromFile(this.OpenFilePath, new ParquetOptions() { TreatByteArrayAsString = true }))
            {
                return parquetReader.Schema;
            }
        });

        // Poll so the worker can notice a cancellation request; the read itself
        // has no cancellation token and runs to completion regardless. Wait()
        // rethrows if the task faulted, which BackgroundWorker surfaces as e.Error.
        while (!task.IsCompleted && !((BackgroundWorker)sender).CancellationPending)
        {
            task.Wait(1000);
        }

        e.Cancel = ((BackgroundWorker)sender).CancellationPending;
        if (task.IsCompleted)
        {
            e.Result = task.Result;
        }
    }
}
/// <summary>
/// BackgroundWorker DoWork handler that reads a page of records into a DataTable,
/// either with a single sequential reader (default engine) or by splitting the
/// selected fields across processor-count parallel readers and merging the
/// per-group DataTables afterwards.
/// </summary>
/// <param name="sender">The BackgroundWorker that raised the event (cast for CancellationPending).</param>
/// <param name="e">Receives a merged ParquetReadResult in e.Result, or e.Cancel = true on cancellation.</param>
private void ReadDataBackgroundWorker_DoWork(object sender, DoWorkEventArgs e) {
    //Parquet.NET doesn't have any async methods or readers that allow sequential records reading so we need to use the ThreadPool to support cancellation.
    Task task = null;
    // Keyed by field-group index so the partial tables can be merged back in order.
    var results = new ConcurrentDictionary<int, ParquetReadResult>();
    // NOTE(review): this CancellationTokenSource is never disposed on any path.
    var cancellationToken = new System.Threading.CancellationTokenSource();

    if (AppSettings.ReadingEngine == ParquetEngine.Default)
    {
        // Sequential engine: one reader over all selected fields; single result at key 1.
        task = Task.Run(() =>
        {
            using (var parquetReader = ParquetReader.OpenFromFile(this.OpenFilePath, new ParquetOptions() { TreatByteArrayAsString = true }))
            {
                DataTable result = UtilityMethods.ParquetReaderToDataTable(parquetReader, this.SelectedFields, this.CurrentOffset, this.CurrentMaxRowCount, cancellationToken.Token);
                results.TryAdd(1, new ParquetReadResult(result, parquetReader.ThriftMetadata.Num_rows));
            }
        });
    }
    else
    {
        // Parallel engine: partition the selected fields into groups, one reader per group.
        // NOTE(review): mixes this.SelectedFields and this.selectedFields — presumably the
        // same collection via a property; confirm. Also, when the field count is smaller
        // than ProcessorCount the integer division yields a chunk size of 0 — verify that
        // UtilityMethods.Split handles that without looping or throwing.
        int i = 0;
        var fieldGroups = new List<(int, List<string>)>();
        foreach (List<string> fields in UtilityMethods.Split(this.SelectedFields, (int)(this.selectedFields.Count / Environment.ProcessorCount)))
        {
            fieldGroups.Add((i++, fields));
        }

        task = ParallelAsync.ForeachAsync(fieldGroups, Environment.ProcessorCount, async fieldGroup =>
        {
            await Task.Run(() =>
            {
                // Each group opens its own stream/reader so the parallel reads don't share state.
                using (Stream parquetStream = new FileStream(this.OpenFilePath, FileMode.Open, FileAccess.Read))
                using (var parquetReader = new ParquetReader(parquetStream, new ParquetOptions() { TreatByteArrayAsString = true }))
                {
                    DataTable result = UtilityMethods.ParquetReaderToDataTable(parquetReader, fieldGroup.Item2, this.CurrentOffset, this.CurrentMaxRowCount, cancellationToken.Token);
                    results.TryAdd(fieldGroup.Item1, new ParquetReadResult(result, parquetReader.ThriftMetadata.Num_rows));
                }
            });
        });
    }

    // Poll so the worker can notice a cancellation request; Wait() rethrows if the task faulted.
    while (!task.IsCompleted && !((BackgroundWorker)sender).CancellationPending)
    {
        task.Wait(1000);
    }

    if (((BackgroundWorker)sender).CancellationPending)
    {
        // Signal the token so in-flight ParquetReaderToDataTable calls can stop early.
        cancellationToken.Cancel();
        e.Cancel = true;
    }

    if (task.IsCompleted)
    {
        if (results.Count > 0)
        {
            // Merge the partial tables in field-group order; every group saw the same
            // file, so any group's row count serves as the file total.
            DataTable mergedDataTables = UtilityMethods.MergeTables(results.OrderBy(f => f.Key).Select(f => f.Value.Result).AsEnumerable());
            ParquetReadResult finalResult = new ParquetReadResult(mergedDataTables, results.First().Value.TotalNumberOfRecordsInFile);
            e.Result = finalResult;
        }
        else
        {
            //The code should never reach here
            e.Result = new ParquetReadResult(new DataTable(), 0);
        }
    }
}
/// <summary>
/// Benchmarks three phases against a known large parquet file: reading every data
/// column of row group 0, writing those columns back out uncompressed, and writing
/// them out gzip-compressed. Each phase's elapsed time is returned via an out parameter.
/// </summary>
/// <param name="readTime">Elapsed time for the read phase.</param>
/// <param name="uncompressedWriteTime">Elapsed time for the uncompressed write phase.</param>
/// <param name="gzipWriteTime">Elapsed time for the gzip write phase.</param>
private static void ReadLargeFile(out TimeSpan readTime, out TimeSpan uncompressedWriteTime, out TimeSpan gzipWriteTime)
{
    Schema schema;
    DataColumn[] columns;

    // Phase 1: read all data columns from row group 0 (timed, including reader dispose).
    using (var time = new TimeMeasure())
    {
        using (var reader = ParquetReader.OpenFromFile(@"C:\dev\parquet-dotnet\src\Parquet.Test\data\customer.impala.parquet", new ParquetOptions { TreatByteArrayAsString = true }))
        {
            schema = reader.Schema;
            var columnList = new List<DataColumn>();
            using (ParquetRowGroupReader rowGroupReader = reader.OpenRowGroupReader(0))
            {
                foreach (DataField dataField in reader.Schema.GetDataFields())
                {
                    columnList.Add(rowGroupReader.ReadColumn(dataField));
                }
            }
            columns = columnList.ToArray();
        }
        readTime = time.Elapsed;
    }

    // Phase 2: write the columns back out with no compression.
    using (FileStream dest = F.OpenWrite("perf.uncompressed.parquet"))
    using (var time = new TimeMeasure())
    {
        using (var writer = new ParquetWriter(schema, dest))
        {
            writer.CompressionMethod = CompressionMethod.None;
            using (ParquetRowGroupWriter rowGroup = writer.CreateRowGroup())
            {
                foreach (DataColumn column in columns)
                {
                    rowGroup.WriteColumn(column);
                }
            }
        }
        uncompressedWriteTime = time.Elapsed;
    }

    // Phase 3: write the columns back out gzip-compressed.
    using (FileStream dest = F.OpenWrite("perf.gzip.parquet"))
    using (var time = new TimeMeasure())
    {
        using (var writer = new ParquetWriter(schema, dest))
        {
            writer.CompressionMethod = CompressionMethod.Gzip;
            using (ParquetRowGroupWriter rowGroup = writer.CreateRowGroup())
            {
                foreach (DataColumn column in columns)
                {
                    rowGroup.WriteColumn(column);
                }
            }
        }
        gzipWriteTime = time.Elapsed;
    }
}