Exemple #1
0
        /// <summary>
        /// Builds the schema field describing a single CLR property.
        /// </summary>
        /// <param name="property">The property to describe.</param>
        /// <returns>
        /// A <see cref="DataField"/> named after the property (or after
        /// <see cref="ParquetColumnAttribute"/>'s <c>Name</c> when one is supplied). When the attribute
        /// is present and the matched handler's CLR type is TimeSpan, DateTime/DateTimeOffset or decimal,
        /// the corresponding specialised field type is returned instead. Returns <c>null</c> when
        /// <see cref="DataTypeFactory"/> has no handler for the property's underlying type.
        /// </returns>
        private Field GetField(PropertyInfo property)
        {
            // The handler lookup is done on the underlying type: strip the nullable wrapper first,
            // then reduce arrays to their element type.
            Type lookupType = property.PropertyType;

            if (lookupType.IsNullable())
            {
                lookupType = lookupType.GetNonNullable();
            }
            if (lookupType.IsArray)
            {
                lookupType = lookupType.GetElementType();
            }

            IDataTypeHandler handler = DataTypeFactory.Match(lookupType);

            if (handler == null)
            {
                return null;
            }

            ParquetColumnAttribute columnAttr = property.GetCustomAttribute<ParquetColumnAttribute>();

            string name = columnAttr?.Name ?? property.Name;

            // Use the declared CLR type here (not the unwrapped lookup type) as the DataField
            // constructor will figure out nullability and other parameters.
            var field = new DataField(name, property.PropertyType);

            if (columnAttr != null)
            {
                // The attribute carries format details for these CLR types; at most one branch applies.
                Type clrType = handler.ClrType;

                if (clrType == typeof(TimeSpan))
                {
                    field = new TimeSpanDataField(field.Name, columnAttr.TimeSpanFormat, field.HasNulls, field.IsArray);
                }
                else if (clrType == typeof(DateTime) || clrType == typeof(DateTimeOffset))
                {
                    field = new DateTimeDataField(field.Name, columnAttr.DateTimeFormat, field.HasNulls, field.IsArray);
                }
                else if (clrType == typeof(decimal))
                {
                    field = new DecimalDataField(field.Name, columnAttr.DecimalPrecision, columnAttr.DecimalScale, columnAttr.DecimalForceByteArrayEncoding, field.HasNulls, field.IsArray);
                }
            }

            // Remember the original property name even when the column name was overridden.
            field.ClrPropName = property.Name;

            return field;
        }
        /// <summary>
        /// Writes the same float time series to disk in several formats (CSV, gzipped CSV, and a
        /// number of Parquet layouts and writer APIs) and prints the resulting file size and the
        /// elapsed time for each one.
        /// </summary>
        /// <param name="warmup">
        /// Not read by the method body. Presumably it exists only so the test framework runs the
        /// method once per listed value (0 and 1) - confirm against the test runner set-up.
        /// </param>
        public static void TestWriteFloatTimeSeries([Values(0, 1)] int warmup)
        {
            var timer = Stopwatch.StartNew();

            Console.WriteLine("Generating data...");

            var (dates, objectIds, values, numRows) = CreateFloatDataFrame();

            // Emits one "date,objectId,value" line per (date, object) pair. Formatting goes through
            // the invariant culture so that the decimal separator and calendar never depend on the
            // machine's locale (a comma decimal separator would corrupt the CSV).
            void WriteCsvRows(StreamWriter csv)
            {
                for (int i = 0; i != dates.Length; ++i)
                {
                    for (int j = 0; j != objectIds.Length; ++j)
                    {
                        csv.WriteLine(string.Format(
                            System.Globalization.CultureInfo.InvariantCulture,
                            "{0:yyyy-MM-dd HH:mm:ss},{1},{2}",
                            dates[i], objectIds[j], values[i][j]));
                    }
                }
            }

            Console.WriteLine("Generated {0:N0} rows in {1:N2} sec", numRows, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
            Console.WriteLine("Saving to CSV");

            timer.Restart();

            // --- Plain CSV ---
            using (var csv = new StreamWriter("float_timeseries.csv"))
            {
                WriteCsvRows(csv);
            }

            Console.WriteLine("Saved to CSV ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.csv").Length, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
            Console.WriteLine("Saving to CSV.GZ");

            timer.Restart();

            // --- Gzip-compressed CSV ---
            using (var stream = new FileStream("float_timeseries.csv.gz", FileMode.Create))
            {
                using var zip = new GZipStream(stream, CompressionLevel.Optimal);
                using var csv = new StreamWriter(zip);

                WriteCsvRows(csv);
            }

            Console.WriteLine("Saved to CSV.GZ ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.csv.gz").Length, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
            Console.WriteLine("Saving to Parquet");

            timer.Restart();

            // --- Parquet file, single row group, column by column ---
            using (var fileWriter = new ParquetFileWriter("float_timeseries.parquet", CreateFloatColumns()))
            {
                using var rowGroupWriter = fileWriter.AppendRowGroup();

                using (var dateTimeWriter = rowGroupWriter.NextColumn().LogicalWriter<DateTime>())
                {
                    for (int i = 0; i != dates.Length; ++i)
                    {
                        // Each date is repeated once per object id to line up with the other columns.
                        dateTimeWriter.WriteBatch(Enumerable.Repeat(dates[i], objectIds.Length).ToArray());
                    }
                }

                using (var objectIdWriter = rowGroupWriter.NextColumn().LogicalWriter<int>())
                {
                    for (int i = 0; i != dates.Length; ++i)
                    {
                        objectIdWriter.WriteBatch(objectIds);
                    }
                }

                using (var valueWriter = rowGroupWriter.NextColumn().LogicalWriter<float>())
                {
                    for (int i = 0; i != dates.Length; ++i)
                    {
                        valueWriter.WriteBatch(values[i]);
                    }
                }

                // Explicit Close; the enclosing using still disposes the writer afterwards.
                fileWriter.Close();
            }

            Console.WriteLine("Saved to Parquet ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.parquet").Length, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
            Console.WriteLine("Saving to Parquet.Chunked (by date)");

            timer.Restart();

            // --- Parquet file, one row group per date ---
            using (var fileWriter = new ParquetFileWriter("float_timeseries.parquet.chunked", CreateFloatColumns()))
            {
                for (int i = 0; i != dates.Length; ++i)
                {
                    using var rowGroupWriter = fileWriter.AppendRowGroup();

                    using (var dateTimeWriter = rowGroupWriter.NextColumn().LogicalWriter<DateTime>())
                    {
                        dateTimeWriter.WriteBatch(Enumerable.Repeat(dates[i], objectIds.Length).ToArray());
                    }

                    using (var objectIdWriter = rowGroupWriter.NextColumn().LogicalWriter<int>())
                    {
                        objectIdWriter.WriteBatch(objectIds);
                    }

                    using (var valueWriter = rowGroupWriter.NextColumn().LogicalWriter<float>())
                    {
                        valueWriter.WriteBatch(values[i]);
                    }
                }

                fileWriter.Close();
            }

            Console.WriteLine("Saved to Parquet.Chunked ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.parquet.chunked").Length, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
            Console.WriteLine("Saving to Parquet.RowOriented");

            timer.Restart();

            // --- Parquet file through the row-oriented writer ---
            using (var rowWriter = ParquetFile.CreateRowWriter<(DateTime, int, float)>("float_timeseries.parquet.roworiented", new[] { "DateTime", "ObjectId", "Value" }))
            {
                for (int i = 0; i != dates.Length; ++i)
                {
                    for (int j = 0; j != objectIds.Length; ++j)
                    {
                        rowWriter.WriteRow((dates[i], objectIds[j], values[i][j]));
                    }
                }
            }

            Console.WriteLine("Saved to Parquet.RowOriented ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.parquet.roworiented").Length, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
            Console.WriteLine("Saving to Parquet.Stream");

            timer.Restart();

            // --- Same as "Parquet", but written through a managed output stream ---
            using (var stream = new FileStream("float_timeseries.parquet.stream", FileMode.Create))
            {
                using var writer = new IO.ManagedOutputStream(stream);
                using var fileWriter = new ParquetFileWriter(writer, CreateFloatColumns());
                using var rowGroupWriter = fileWriter.AppendRowGroup();

                using (var dateTimeWriter = rowGroupWriter.NextColumn().LogicalWriter<DateTime>())
                {
                    for (int i = 0; i != dates.Length; ++i)
                    {
                        dateTimeWriter.WriteBatch(Enumerable.Repeat(dates[i], objectIds.Length).ToArray());
                    }
                }

                using (var objectIdWriter = rowGroupWriter.NextColumn().LogicalWriter<int>())
                {
                    for (int i = 0; i != dates.Length; ++i)
                    {
                        objectIdWriter.WriteBatch(objectIds);
                    }
                }

                using (var valueWriter = rowGroupWriter.NextColumn().LogicalWriter<float>())
                {
                    for (int i = 0; i != dates.Length; ++i)
                    {
                        valueWriter.WriteBatch(values[i]);
                    }
                }

                fileWriter.Close();
            }

            Console.WriteLine("Saved to Parquet.Stream ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.parquet.stream").Length, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
            Console.WriteLine("Saving to Parquet.Chunked.Stream (by date)");

            timer.Restart();

            // --- Same as "Parquet.Chunked", but written through a managed output stream ---
            using (var stream = new FileStream("float_timeseries.parquet.chunked.stream", FileMode.Create))
            {
                using var writer = new IO.ManagedOutputStream(stream);
                using var fileWriter = new ParquetFileWriter(writer, CreateFloatColumns());

                for (int i = 0; i != dates.Length; ++i)
                {
                    using var rowGroupWriter = fileWriter.AppendRowGroup();

                    using (var dateTimeWriter = rowGroupWriter.NextColumn().LogicalWriter<DateTime>())
                    {
                        dateTimeWriter.WriteBatch(Enumerable.Repeat(dates[i], objectIds.Length).ToArray());
                    }

                    using (var objectIdWriter = rowGroupWriter.NextColumn().LogicalWriter<int>())
                    {
                        objectIdWriter.WriteBatch(objectIds);
                    }

                    using (var valueWriter = rowGroupWriter.NextColumn().LogicalWriter<float>())
                    {
                        valueWriter.WriteBatch(values[i]);
                    }
                }

                fileWriter.Close();
            }

            Console.WriteLine("Saved to Parquet.Chunked.Stream ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.parquet.chunked.stream").Length, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
            Console.WriteLine("Saving to Parquet.RowOriented.Stream");

            timer.Restart();

            // --- Row-oriented writer on top of a managed output stream ---
            using (var stream = new FileStream("float_timeseries.parquet.roworiented.stream", FileMode.Create))
            {
                using var writer = new IO.ManagedOutputStream(stream);
                using var rowWriter = ParquetFile.CreateRowWriter<(DateTime, int, float)>(writer, new[] { "DateTime", "ObjectId", "Value" });

                for (int i = 0; i != dates.Length; ++i)
                {
                    for (int j = 0; j != objectIds.Length; ++j)
                    {
                        rowWriter.WriteRow((dates[i], objectIds[j], values[i][j]));
                    }
                }

                rowWriter.Close();
            }

            Console.WriteLine("Saved to Parquet.RowOriented.Stream ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.parquet.roworiented.stream").Length, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
            Console.WriteLine("Saving to Parquet.NET");

            timer.Restart();

            // --- Parquet.NET: each column is materialised as one flat array, then written ---
            {
                var dateTimeField = new DateTimeDataField("DateTime", DateTimeFormat.DateAndTime);
                var objectIdField = new DataField<int>("ObjectId");
                var valueField = new DataField<float>("Value");
                var schema = new Parquet.Data.Schema(dateTimeField, objectIdField, valueField);

                using (var stream = File.Create("float_timeseries.parquet.net"))
                using (var parquetWriter = new ParquetWriter(schema, stream))
                using (var groupWriter = parquetWriter.CreateRowGroup())
                {
                    var dateTimeColumn = new DataColumn(
                        dateTimeField,
                        dates.SelectMany(d => Enumerable.Repeat(new DateTimeOffset(d), objectIds.Length)).ToArray());

                    var objectIdColumn = new DataColumn(
                        objectIdField,
                        dates.SelectMany(d => objectIds).ToArray());

                    var valueColumn = new DataColumn(
                        valueField,
                        dates.SelectMany((d, i) => values[i]).ToArray());

                    groupWriter.WriteColumn(dateTimeColumn);
                    groupWriter.WriteColumn(objectIdColumn);
                    groupWriter.WriteColumn(valueColumn);
                }
            }

            Console.WriteLine("Saved to Parquet.NET ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.parquet.net").Length, timer.Elapsed.TotalSeconds);
        }
        /// <summary>
        /// Generates a pseudo-random float time series (fixed seed, so the data is the same on every
        /// run), writes it to disk as CSV, gzipped CSV and several Parquet variants, and prints the
        /// resulting file size and elapsed time for each format.
        /// </summary>
        public static void TestFloatTimeSeries()
        {
            var timer = Stopwatch.StartNew();
            var rand = new Random(123);

            Console.WriteLine("Generating data...");

            // Hourly timestamps starting 2001-01-01, weekends removed.
            // 360 steps; the original carried a commented-out "*24*12" multiplier for a larger run.
            var dates = Enumerable.Range(0, 360)
                        .Select(i => new DateTime(2001, 01, 01) + TimeSpan.FromHours(i))
                        .Where(d => d.DayOfWeek != DayOfWeek.Saturday && d.DayOfWeek != DayOfWeek.Sunday)
                        .ToArray();

            // Up to 10000 distinct random ids, sorted ascending.
            var objectIds = Enumerable.Range(0, 10000)
                            .Select(i => rand.Next())
                            .Distinct()
                            .OrderBy(i => i)
                            .ToArray();

            // One random float per (date, object id) pair; values[i][j] belongs to dates[i], objectIds[j].
            var values = dates.Select(d => objectIds.Select(o => (float)rand.NextDouble()).ToArray()).ToArray();

            // Emits one "date,objectId,value" line per (date, object) pair. Formatting goes through
            // the invariant culture so that the decimal separator and calendar never depend on the
            // machine's locale (a comma decimal separator would corrupt the CSV).
            void WriteCsvRows(StreamWriter csv)
            {
                for (int i = 0; i != dates.Length; ++i)
                {
                    for (int j = 0; j != objectIds.Length; ++j)
                    {
                        csv.WriteLine(string.Format(
                            System.Globalization.CultureInfo.InvariantCulture,
                            "{0:yyyy-MM-dd HH:mm:ss},{1},{2}",
                            dates[i], objectIds[j], values[i][j]));
                    }
                }
            }

            Console.WriteLine("Generated {0:N0} rows in {1:N2} sec", values.Sum(v => v.Length), timer.Elapsed.TotalSeconds);
            Console.WriteLine();
            Console.WriteLine("Saving to CSV");

            timer.Restart();

            // --- Plain CSV ---
            using (var csv = new StreamWriter("float_timeseries.csv"))
            {
                WriteCsvRows(csv);
            }

            Console.WriteLine("Saved to CSV ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.csv").Length, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
            Console.WriteLine("Saving to CSV.GZ");

            timer.Restart();

            // --- Gzip-compressed CSV ---
            using (var stream = new FileStream("float_timeseries.csv.gz", FileMode.Create))
            using (var zip = new GZipStream(stream, CompressionLevel.Optimal))
            using (var csv = new StreamWriter(zip))
            {
                WriteCsvRows(csv);
            }

            Console.WriteLine("Saved to CSV.GZ ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.csv.gz").Length, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
            Console.WriteLine("Saving to Parquet");

            timer.Restart();

            // --- Parquet file, single row group, column by column ---
            using (var fileWriter = new ParquetFileWriter("float_timeseries.parquet", CreateFloatColumns()))
            using (var rowGroupWriter = fileWriter.AppendRowGroup())
            {
                using (var dateTimeWriter = rowGroupWriter.NextColumn().LogicalWriter<DateTime>())
                {
                    for (int i = 0; i != dates.Length; ++i)
                    {
                        // Each date is repeated once per object id to line up with the other columns.
                        dateTimeWriter.WriteBatch(Enumerable.Repeat(dates[i], objectIds.Length).ToArray());
                    }
                }

                using (var objectIdWriter = rowGroupWriter.NextColumn().LogicalWriter<int>())
                {
                    for (int i = 0; i != dates.Length; ++i)
                    {
                        objectIdWriter.WriteBatch(objectIds);
                    }
                }

                using (var valueWriter = rowGroupWriter.NextColumn().LogicalWriter<float>())
                {
                    for (int i = 0; i != dates.Length; ++i)
                    {
                        valueWriter.WriteBatch(values[i]);
                    }
                }
            }

            Console.WriteLine("Saved to Parquet ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.parquet").Length, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
            Console.WriteLine("Saving to Parquet.Chunked (by date)");

            timer.Restart();

            // --- Parquet file, one row group per date ---
            using (var fileWriter = new ParquetFileWriter("float_timeseries.parquet.chunked", CreateFloatColumns()))
            {
                for (int i = 0; i != dates.Length; ++i)
                {
                    using (var rowGroupWriter = fileWriter.AppendRowGroup())
                    {
                        using (var dateTimeWriter = rowGroupWriter.NextColumn().LogicalWriter<DateTime>())
                        {
                            dateTimeWriter.WriteBatch(Enumerable.Repeat(dates[i], objectIds.Length).ToArray());
                        }

                        using (var objectIdWriter = rowGroupWriter.NextColumn().LogicalWriter<int>())
                        {
                            objectIdWriter.WriteBatch(objectIds);
                        }

                        using (var valueWriter = rowGroupWriter.NextColumn().LogicalWriter<float>())
                        {
                            valueWriter.WriteBatch(values[i]);
                        }
                    }
                }
            }

            Console.WriteLine("Saved to Parquet.Chunked ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.parquet.chunked").Length, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
            Console.WriteLine("Saving to Parquet.RowOriented");

            timer.Restart();

            // --- Parquet file through the row-oriented writer ---
            using (var rowWriter = ParquetFile.CreateRowWriter<(DateTime, int, float)>("float_timeseries.parquet.roworiented", new[] { "DateTime", "ObjectId", "Value" }))
            {
                for (int i = 0; i != dates.Length; ++i)
                {
                    for (int j = 0; j != objectIds.Length; ++j)
                    {
                        rowWriter.WriteRow((dates[i], objectIds[j], values[i][j]));
                    }
                }
            }

            Console.WriteLine("Saved to Parquet.RowOriented ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.parquet.roworiented").Length, timer.Elapsed.TotalSeconds);
            Console.WriteLine();
            Console.WriteLine("Saving to Parquet.NET");

            timer.Restart();

            // --- Parquet.NET: each column is materialised as one flat array, then written ---
            {
                var dateTimeField = new DateTimeDataField("DateTime", DateTimeFormat.DateAndTime);
                var objectIdField = new DataField<int>("ObjectId");
                var valueField = new DataField<float>("Value");
                var schema = new Parquet.Data.Schema(dateTimeField, objectIdField, valueField);

                using (var stream = File.Create("float_timeseries.parquet.net"))
                using (var parquetWriter = new ParquetWriter(schema, stream))
                using (var groupWriter = parquetWriter.CreateRowGroup())
                {
                    var dateTimeColumn = new DataColumn(
                        dateTimeField,
                        dates.SelectMany(d => Enumerable.Repeat(new DateTimeOffset(d), objectIds.Length)).ToArray());

                    var objectIdColumn = new DataColumn(
                        objectIdField,
                        dates.SelectMany(d => objectIds).ToArray());

                    var valueColumn = new DataColumn(
                        valueField,
                        dates.SelectMany((d, i) => values[i]).ToArray());

                    groupWriter.WriteColumn(dateTimeColumn);
                    groupWriter.WriteColumn(objectIdColumn);
                    groupWriter.WriteColumn(valueColumn);
                }
            }

            Console.WriteLine("Saved to Parquet.NET ({0:N0} bytes) in {1:N2} sec", new FileInfo("float_timeseries.parquet.net").Length, timer.Elapsed.TotalSeconds);
        }