/// <summary>
/// Infers a Parquet schema from the structure of a single JSON object.
/// </summary>
/// <param name="jObject">The JSON object to analyse. Must not be null.</param>
/// <returns>The inferred Parquet schema.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="jObject"/> is null.</exception>
public static PSchema InferParquetSchema(this JObject jObject)
{
    // Guard early: a null element inside the sample array would otherwise
    // surface as an obscure failure deep inside the schema inferrer.
    if (jObject == null)
    {
        throw new ArgumentNullException(nameof(jObject));
    }

    var schemaExtractor = new JsonSchemaInferring();
    // InferSchema takes a sample collection of objects; supply the single instance.
    PSchema schema = schemaExtractor.InferSchema(new[] { jObject });
    return schema;
}
/// <summary>
/// Benchmark body: writes the decimal series with Parquet.NET and, when
/// checking is enabled, verifies the output against the file previously
/// written by ParquetSharp.
/// </summary>
/// <returns>Size in bytes of the file produced by Parquet.NET.</returns>
public long ParquetDotNet()
{
    const string outputPath = "decimal_timeseries.parquet.net";

    {
        var valueField = new DecimalDataField("Value", precision: 29, scale: 3, hasNulls: false);
        var schema = new Parquet.Data.Schema(valueField);
        using var outputStream = File.Create(outputPath);
        using var writer = new ParquetWriter(schema, outputStream);
        using var rowGroup = writer.CreateRowGroup();
        rowGroup.WriteColumn(new DataColumn(valueField, _values));
    }

    if (Check.Enabled)
    {
        // Read content from ParquetSharp and Parquet.NET
        var fromParquetSharp = ReadFile("decimal_timeseries.parquet");
        var fromParquetNet = ReadFile(outputPath);

        // Prove that the content is the same
        Check.ArraysAreEqual(_values, fromParquetSharp);
        Check.ArraysAreEqual(fromParquetSharp, fromParquetNet);
    }

    return new FileInfo(outputPath).Length;
}
/// <summary>
/// Generates ten million pseudo-random decimal values, writes them first with
/// ParquetSharp and then with Parquet.NET, printing the elapsed time and file
/// size for each writer so the two can be compared.
/// </summary>
/// <param name="warmup">Unused in the body; [Values(0, 1)] makes the test run
/// twice so the second run is measured with a warm JIT and file cache.</param>
public static void TestDecimalSeries([Values(0, 1)] int warmup)
{
    var stopwatch = Stopwatch.StartNew();
    var random = new Random(123);

    Console.WriteLine("Generating data...");
    var values = Enumerable
        .Range(0, 10_000_000)
        .Select(i =>
        {
            // NOTE: the order of the two RNG calls must not change, or the
            // generated sequence (seed 123) would differ from run to run.
            var n = random.Next();
            var sign = random.NextDouble() < 0.5 ? -1M : +1M;
            return sign * ((decimal)n * n * n) / 1000M;
        })
        .ToArray();
    Console.WriteLine("Generated {0:N0} rows in {1:N2} sec", values.Length, stopwatch.Elapsed.TotalSeconds);
    Console.WriteLine();

    Console.WriteLine("Saving to Parquet");
    stopwatch.Restart();
    var columns = new Column[]
    {
        new Column<decimal>("Value", LogicalType.Decimal(precision: 29, scale: 3))
    };
    using (var fileWriter = new ParquetFileWriter("decimal_timeseries.parquet", columns))
    {
        using (var rowGroupWriter = fileWriter.AppendRowGroup())
        {
            using var valueWriter = rowGroupWriter.NextColumn().LogicalWriter<decimal>();
            valueWriter.WriteBatch(values);
        }
        fileWriter.Close();
    }
    Console.WriteLine(
        "Saved to Parquet ({0:N0} bytes) in {1:N2} sec",
        new FileInfo("decimal_timeseries.parquet").Length,
        stopwatch.Elapsed.TotalSeconds);
    Console.WriteLine();

    Console.WriteLine("Saving to Parquet.NET");
    stopwatch.Restart();
    {
        var valueField = new DecimalDataField("Value", precision: 29, scale: 3);
        var schema = new Parquet.Data.Schema(valueField);
        using var stream = File.Create("decimal_timeseries.parquet.net");
        using var parquetWriter = new ParquetWriter(schema, stream);
        using var groupWriter = parquetWriter.CreateRowGroup();
        groupWriter.WriteColumn(new DataColumn(valueField, values));
    }
    Console.WriteLine(
        "Saved to Parquet.NET ({0:N0} bytes) in {1:N2} sec",
        new FileInfo("decimal_timeseries.parquet.net").Length,
        stopwatch.Elapsed.TotalSeconds);
}
/// <summary>
/// Converts a JSON object into a single-row Parquet <see cref="DataSet"/>
/// using the caller-supplied schema.
/// </summary>
/// <param name="jObject">The JSON object to convert. Must not be null.</param>
/// <param name="schema">The Parquet schema describing the row layout. Must not be null.</param>
/// <returns>A data set containing one row extracted from <paramref name="jObject"/>.</returns>
/// <exception cref="ArgumentNullException">Thrown when either argument is null.</exception>
public static DataSet ToParquetDataSet(this JObject jObject, PSchema schema)
{
    // Guard both arguments; the original only checked 'schema', letting a
    // null jObject fail later inside JsonDataExtractor.AddRow.
    if (jObject == null)
    {
        throw new ArgumentNullException(nameof(jObject));
    }
    if (schema == null)
    {
        throw new ArgumentNullException(nameof(schema));
    }

    //convert data
    var dataExtractor = new JsonDataExtractor(schema);
    var ds = new DataSet(schema);
    dataExtractor.AddRow(ds, jObject);
    return ds;
}
/// <summary>
/// Converts a JSON object into a single-row Parquet <see cref="DataSet"/>,
/// first extracting a schema from the object itself.
/// </summary>
/// <param name="jObject">The JSON object to convert. Must not be null.</param>
/// <returns>A data set containing one row extracted from <paramref name="jObject"/>.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="jObject"/> is null.</exception>
public static DataSet ToParquetDataSet(this JObject jObject)
{
    if (jObject == null)
    {
        throw new ArgumentNullException(nameof(jObject));
    }

    //extract schema
    var schemaExtractor = new JsonSchemaExtractor();
    schemaExtractor.Analyze(jObject);
    PSchema schema = schemaExtractor.GetSchema();

    // Delegate to the schema-taking overload so the conversion logic lives
    // in exactly one place.
    return jObject.ToParquetDataSet(schema);
}
// Benchmark: write one float time-series (DateTime, ObjectId, Value) with a
// range of formats and APIs — CSV, gzipped CSV, ParquetSharp (single row
// group, chunked-by-date, row-oriented, and the same three again through a
// managed .NET output stream) and Parquet.NET — printing the output file size
// and elapsed time after each variant so the approaches can be compared.
//
// The 'warmup' parameter is unused in the body; [Values(0, 1)] makes NUnit
// run the test twice so the second run is measured with a warm JIT/file cache.
public static void TestWriteFloatTimeSeries([Values(0, 1)] int warmup)
{
    var timer = Stopwatch.StartNew();

    Console.WriteLine("Generating data...");
    // values[i][j] is the float for dates[i] / objectIds[j];
    // numRows = dates.Length * objectIds.Length (per CreateFloatDataFrame's
    // contract — defined elsewhere in this file).
    var(dates, objectIds, values, numRows) = CreateFloatDataFrame();
    Console.WriteLine("Generated {0:N0} rows in {1:N2} sec", numRows, timer.Elapsed.TotalSeconds);
    Console.WriteLine();

    // --- Plain CSV ---
    Console.WriteLine("Saving to CSV");
    timer.Restart();
    using (var csv = new StreamWriter("float_timeseries.csv"))
    {
        for (int i = 0; i != dates.Length; ++i)
        {
            for (int j = 0; j != objectIds.Length; ++j)
            {
                csv.WriteLine("{0:yyyy-MM-dd HH:mm:ss},{1},{2}", dates[i], objectIds[j], values[i][j]);
            }
        }
    }
    Console.WriteLine("Saved to CSV ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.csv").Length, timer.Elapsed.TotalSeconds);
    Console.WriteLine();

    // --- Gzip-compressed CSV ---
    Console.WriteLine("Saving to CSV.GZ");
    timer.Restart();
    using (var stream = new FileStream("float_timeseries.csv.gz", FileMode.Create))
    {
        using var zip = new GZipStream(stream, CompressionLevel.Optimal);
        using var csv = new StreamWriter(zip);
        for (int i = 0; i != dates.Length; ++i)
        {
            for (int j = 0; j != objectIds.Length; ++j)
            {
                csv.WriteLine("{0:yyyy-MM-dd HH:mm:ss},{1},{2}", dates[i], objectIds[j], values[i][j]);
            }
        }
    }
    Console.WriteLine("Saved to CSV ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.csv.gz").Length, timer.Elapsed.TotalSeconds);
    Console.WriteLine();

    // --- ParquetSharp: one row group holding the whole series ---
    Console.WriteLine("Saving to Parquet");
    timer.Restart();
    using (var fileWriter = new ParquetFileWriter("float_timeseries.parquet", CreateFloatColumns()))
    {
        using var rowGroupWriter = fileWriter.AppendRowGroup();
        // Columns are written one after another via NextColumn(); each date is
        // repeated once per object id so the three columns stay row-aligned.
        using (var dateTimeWriter = rowGroupWriter.NextColumn().LogicalWriter<DateTime>())
        {
            for (int i = 0; i != dates.Length; ++i)
            {
                dateTimeWriter.WriteBatch(Enumerable.Repeat(dates[i], objectIds.Length).ToArray());
            }
        }
        using (var objectIdWriter = rowGroupWriter.NextColumn().LogicalWriter<int>())
        {
            for (int i = 0; i != dates.Length; ++i)
            {
                objectIdWriter.WriteBatch(objectIds);
            }
        }
        using (var valueWriter = rowGroupWriter.NextColumn().LogicalWriter<float>())
        {
            for (int i = 0; i != dates.Length; ++i)
            {
                valueWriter.WriteBatch(values[i]);
            }
        }
        fileWriter.Close();
    }
    Console.WriteLine("Saved to Parquet ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.parquet").Length, timer.Elapsed.TotalSeconds);
    Console.WriteLine();

    // --- ParquetSharp: one row group per date ---
    Console.WriteLine("Saving to Parquet.Chunked (by date)");
    timer.Restart();
    using (var fileWriter = new ParquetFileWriter("float_timeseries.parquet.chunked", CreateFloatColumns()))
    {
        for (int i = 0; i != dates.Length; ++i)
        {
            using var rowGroupWriter = fileWriter.AppendRowGroup();
            using (var dateTimeWriter = rowGroupWriter.NextColumn().LogicalWriter<DateTime>())
            {
                dateTimeWriter.WriteBatch(Enumerable.Repeat(dates[i], objectIds.Length).ToArray());
            }
            using (var objectIdWriter = rowGroupWriter.NextColumn().LogicalWriter<int>())
            {
                objectIdWriter.WriteBatch(objectIds);
            }
            using (var valueWriter = rowGroupWriter.NextColumn().LogicalWriter<float>())
            {
                valueWriter.WriteBatch(values[i]);
            }
        }
        fileWriter.Close();
    }
    Console.WriteLine("Saved to Parquet.Chunked ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.parquet.chunked").Length, timer.Elapsed.TotalSeconds);
    Console.WriteLine();

    // --- ParquetSharp: row-oriented API, one tuple per row ---
    Console.WriteLine("Saving to Parquet.RowOriented");
    timer.Restart();
    using (var rowWriter = ParquetFile.CreateRowWriter<(DateTime, int, float)>("float_timeseries.parquet.roworiented", new[] { "DateTime", "ObjectId", "Value" }))
    {
        for (int i = 0; i != dates.Length; ++i)
        {
            for (int j = 0; j != objectIds.Length; ++j)
            {
                rowWriter.WriteRow((dates[i], objectIds[j], values[i][j]));
            }
        }
    }
    Console.WriteLine("Saved to Parquet.RowOriented ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.parquet.roworiented").Length, timer.Elapsed.TotalSeconds);
    Console.WriteLine();

    // --- ParquetSharp writing through a managed .NET stream ---
    Console.WriteLine("Saving to Parquet.Stream");
    timer.Restart();
    using (var stream = new FileStream("float_timeseries.parquet.stream", FileMode.Create))
    {
        using var writer = new IO.ManagedOutputStream(stream);
        using var fileWriter = new ParquetFileWriter(writer, CreateFloatColumns());
        using var rowGroupWriter = fileWriter.AppendRowGroup();
        using (var dateTimeWriter = rowGroupWriter.NextColumn().LogicalWriter<DateTime>())
        {
            for (int i = 0; i != dates.Length; ++i)
            {
                dateTimeWriter.WriteBatch(Enumerable.Repeat(dates[i], objectIds.Length).ToArray());
            }
        }
        using (var objectIdWriter = rowGroupWriter.NextColumn().LogicalWriter<int>())
        {
            for (int i = 0; i != dates.Length; ++i)
            {
                objectIdWriter.WriteBatch(objectIds);
            }
        }
        using (var valueWriter = rowGroupWriter.NextColumn().LogicalWriter<float>())
        {
            for (int i = 0; i != dates.Length; ++i)
            {
                valueWriter.WriteBatch(values[i]);
            }
        }
        // Explicit Close while the managed stream is still open (using
        // declarations dispose in reverse order after this point).
        fileWriter.Close();
    }
    Console.WriteLine("Saved to Parquet.Stream ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.parquet.stream").Length, timer.Elapsed.TotalSeconds);
    Console.WriteLine();

    // --- ParquetSharp: chunked-by-date through a managed stream ---
    Console.WriteLine("Saving to Parquet.Chunked.Stream (by date)");
    timer.Restart();
    using (var stream = new FileStream("float_timeseries.parquet.chunked.stream", FileMode.Create))
    {
        using var writer = new IO.ManagedOutputStream(stream);
        using var fileWriter = new ParquetFileWriter(writer, CreateFloatColumns());
        for (int i = 0; i != dates.Length; ++i)
        {
            using var rowGroupWriter = fileWriter.AppendRowGroup();
            using (var dateTimeWriter = rowGroupWriter.NextColumn().LogicalWriter<DateTime>())
            {
                dateTimeWriter.WriteBatch(Enumerable.Repeat(dates[i], objectIds.Length).ToArray());
            }
            using (var objectIdWriter = rowGroupWriter.NextColumn().LogicalWriter<int>())
            {
                objectIdWriter.WriteBatch(objectIds);
            }
            using (var valueWriter = rowGroupWriter.NextColumn().LogicalWriter<float>())
            {
                valueWriter.WriteBatch(values[i]);
            }
        }
        fileWriter.Close();
    }
    Console.WriteLine("Saved to Parquet.Chunked.Stream ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.parquet.chunked.stream").Length, timer.Elapsed.TotalSeconds);
    Console.WriteLine();

    // --- ParquetSharp: row-oriented API through a managed stream ---
    Console.WriteLine("Saving to Parquet.RowOriented.Stream");
    timer.Restart();
    using (var stream = new FileStream("float_timeseries.parquet.roworiented.stream", FileMode.Create))
    {
        using var writer = new IO.ManagedOutputStream(stream);
        using var rowWriter = ParquetFile.CreateRowWriter<(DateTime, int, float)>(writer, new[] { "DateTime", "ObjectId", "Value" });
        for (int i = 0; i != dates.Length; ++i)
        {
            for (int j = 0; j != objectIds.Length; ++j)
            {
                rowWriter.WriteRow((dates[i], objectIds[j], values[i][j]));
            }
        }
        // Explicit Close before the managed stream is disposed.
        rowWriter.Close();
    }
    Console.WriteLine("Saved to Parquet.RowOriented.Stream ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.parquet.roworiented.stream").Length, timer.Elapsed.TotalSeconds);
    Console.WriteLine();

    // --- Parquet.NET, for comparison with ParquetSharp ---
    Console.WriteLine("Saving to Parquet.NET");
    timer.Restart();
    {
        var dateTimeField = new DateTimeDataField("DateTime", DateTimeFormat.DateAndTime);
        var objectIdField = new DataField<int>("ObjectId");
        var valueField = new DataField<float>("Value");
        var schema = new Parquet.Data.Schema(dateTimeField, objectIdField, valueField);
        using (var stream = File.Create("float_timeseries.parquet.net"))
        using (var parquetWriter = new ParquetWriter(schema, stream))
        using (var groupWriter = parquetWriter.CreateRowGroup())
        {
            // Columns are fully materialised in memory first; DateTime values
            // are wrapped in DateTimeOffset as required by DateTimeDataField.
            var dateTimeColumn = new DataColumn(dateTimeField, dates.SelectMany(d => Enumerable.Repeat(new DateTimeOffset(d), objectIds.Length)).ToArray());
            var objectIdColumn = new DataColumn(objectIdField, dates.SelectMany(d => objectIds).ToArray());
            var valueColumn = new DataColumn(valueField, dates.SelectMany((d, i) => values[i]).ToArray());
            groupWriter.WriteColumn(dateTimeColumn);
            groupWriter.WriteColumn(objectIdColumn);
            groupWriter.WriteColumn(valueColumn);
        }
    }
    Console.WriteLine("Saved to Parquet.NET ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.parquet.net").Length, timer.Elapsed.TotalSeconds);
}
// Benchmark: generate a synthetic float time-series (weekday hourly dates x
// random object ids) and write it with CSV, gzipped CSV, ParquetSharp (single
// row group, chunked-by-date, row-oriented) and Parquet.NET, printing file
// size and elapsed time after each variant.
public static void TestFloatTimeSeries()
{
    var timer = Stopwatch.StartNew();
    var rand = new Random(123); // fixed seed => reproducible data set

    Console.WriteLine("Generating data...");
    // Hourly timestamps from 2001-01-01, weekends removed.
    var dates = Enumerable.Range(0, 360)//*24*12) — commented-out factor for a much larger series
        .Select(i => new DateTime(2001, 01, 01) + TimeSpan.FromHours(i))
        .Where(d => d.DayOfWeek != DayOfWeek.Saturday && d.DayOfWeek != DayOfWeek.Sunday)
        .ToArray();
    // Up to 10,000 distinct random ids, sorted ascending (Distinct may drop
    // duplicates, so the final count can be slightly below 10,000).
    var objectIds = Enumerable.Range(0, 10000)
        .Select(i => rand.Next())
        .Distinct()
        .OrderBy(i => i)
        .ToArray();
    // values[i][j] is the float for dates[i] / objectIds[j].
    var values = dates.Select(d => objectIds.Select(o => (float)rand.NextDouble()).ToArray()).ToArray();
    Console.WriteLine("Generated {0:N0} rows in {1:N2} sec", values.Select(v => v.Length).Aggregate(0, (sum, l) => sum + l), timer.Elapsed.TotalSeconds);
    Console.WriteLine();

    // --- Plain CSV ---
    Console.WriteLine("Saving to CSV");
    timer.Restart();
    using (var csv = new StreamWriter("float_timeseries.csv"))
    {
        for (int i = 0; i != dates.Length; ++i)
        {
            for (int j = 0; j != objectIds.Length; ++j)
            {
                csv.WriteLine("{0:yyyy-MM-dd HH:mm:ss},{1},{2}", dates[i], objectIds[j], values[i][j]);
            }
        }
    }
    Console.WriteLine("Saved to CSV ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.csv").Length, timer.Elapsed.TotalSeconds);
    Console.WriteLine();

    // --- Gzip-compressed CSV ---
    Console.WriteLine("Saving to CSV.GZ");
    timer.Restart();
    using (var stream = new FileStream("float_timeseries.csv.gz", FileMode.Create))
    using (var zip = new GZipStream(stream, CompressionLevel.Optimal))
    using (var csv = new StreamWriter(zip))
    {
        for (int i = 0; i != dates.Length; ++i)
        {
            for (int j = 0; j != objectIds.Length; ++j)
            {
                csv.WriteLine("{0:yyyy-MM-dd HH:mm:ss},{1},{2}", dates[i], objectIds[j], values[i][j]);
            }
        }
    }
    Console.WriteLine("Saved to CSV ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.csv.gz").Length, timer.Elapsed.TotalSeconds);
    Console.WriteLine();

    // --- ParquetSharp: one row group holding the whole series ---
    Console.WriteLine("Saving to Parquet");
    timer.Restart();
    using (var fileWriter = new ParquetFileWriter("float_timeseries.parquet", CreateFloatColumns()))
    using (var rowGroupWriter = fileWriter.AppendRowGroup())
    {
        // Columns are written one after another via NextColumn(); each date is
        // repeated once per object id so the three columns stay row-aligned.
        using (var dateTimeWriter = rowGroupWriter.NextColumn().LogicalWriter<DateTime>())
        {
            for (int i = 0; i != dates.Length; ++i)
            {
                dateTimeWriter.WriteBatch(Enumerable.Repeat(dates[i], objectIds.Length).ToArray());
            }
        }
        using (var objectIdWriter = rowGroupWriter.NextColumn().LogicalWriter<int>())
        {
            for (int i = 0; i != dates.Length; ++i)
            {
                objectIdWriter.WriteBatch(objectIds);
            }
        }
        using (var valueWriter = rowGroupWriter.NextColumn().LogicalWriter<float>())
        {
            for (int i = 0; i != dates.Length; ++i)
            {
                valueWriter.WriteBatch(values[i]);
            }
        }
    }
    Console.WriteLine("Saved to Parquet ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.parquet").Length, timer.Elapsed.TotalSeconds);
    Console.WriteLine();

    // --- ParquetSharp: one row group per date ---
    Console.WriteLine("Saving to Parquet.Chunked (by date)");
    timer.Restart();
    using (var fileWriter = new ParquetFileWriter("float_timeseries.parquet.chunked", CreateFloatColumns()))
    {
        for (int i = 0; i != dates.Length; ++i)
        {
            using (var rowGroupWriter = fileWriter.AppendRowGroup())
            {
                using (var dateTimeWriter = rowGroupWriter.NextColumn().LogicalWriter<DateTime>())
                {
                    dateTimeWriter.WriteBatch(Enumerable.Repeat(dates[i], objectIds.Length).ToArray());
                }
                using (var objectIdWriter = rowGroupWriter.NextColumn().LogicalWriter<int>())
                {
                    objectIdWriter.WriteBatch(objectIds);
                }
                using (var valueWriter = rowGroupWriter.NextColumn().LogicalWriter<float>())
                {
                    valueWriter.WriteBatch(values[i]);
                }
            }
        }
    }
    Console.WriteLine("Saved to Parquet.Chunked ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.parquet.chunked").Length, timer.Elapsed.TotalSeconds);
    Console.WriteLine();

    // --- ParquetSharp: row-oriented API, one tuple per row ---
    Console.WriteLine("Saving to Parquet.RowOriented");
    timer.Restart();
    using (var rowWriter = ParquetFile.CreateRowWriter<(DateTime, int, float)>("float_timeseries.parquet.roworiented", new[] { "DateTime", "ObjectId", "Value" }))
    {
        for (int i = 0; i != dates.Length; ++i)
        {
            for (int j = 0; j != objectIds.Length; ++j)
            {
                rowWriter.WriteRow((dates[i], objectIds[j], values[i][j]));
            }
        }
    }
    Console.WriteLine("Saved to Parquet.RowOriented ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.parquet.roworiented").Length, timer.Elapsed.TotalSeconds);
    Console.WriteLine();

    // --- Parquet.NET, for comparison with ParquetSharp ---
    Console.WriteLine("Saving to Parquet.NET");
    timer.Restart();
    {
        var dateTimeField = new DateTimeDataField("DateTime", DateTimeFormat.DateAndTime);
        var objectIdField = new DataField<int>("ObjectId");
        var valueField = new DataField<float>("Value");
        var schema = new Parquet.Data.Schema(dateTimeField, objectIdField, valueField);
        using (var stream = File.Create("float_timeseries.parquet.net"))
        using (var parquetWriter = new ParquetWriter(schema, stream))
        using (var groupWriter = parquetWriter.CreateRowGroup())
        {
            // Columns are fully materialised in memory first; DateTime values
            // are wrapped in DateTimeOffset as required by DateTimeDataField.
            var dateTimeColumn = new DataColumn(dateTimeField, dates.SelectMany(d => Enumerable.Repeat(new DateTimeOffset(d), objectIds.Length)).ToArray());
            var objectIdColumn = new DataColumn(objectIdField, dates.SelectMany(d => objectIds).ToArray());
            var valueColumn = new DataColumn(valueField, dates.SelectMany((d, i) => values[i]).ToArray());
            groupWriter.WriteColumn(dateTimeColumn);
            groupWriter.WriteColumn(objectIdColumn);
            groupWriter.WriteColumn(valueColumn);
        }
    }
    Console.WriteLine("Saved to Parquet.NET ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.parquet.net").Length, timer.Elapsed.TotalSeconds);
}