Example #1
0
        public void Write_read_nullable_column(Array input)
        {
            // Round-trip an arbitrary nullable-int array through a single row group.
            var idField = new DataField <int?>("id");
            var stream = new MemoryStream();

            // Write the input as one row group; the writer must be fully
            // disposed before the stream can be read back.
            using (var writer = new ParquetWriter(new Schema(idField), stream))
            {
                using (ParquetRowGroupWriter rowGroup = writer.CreateRowGroup())
                {
                    rowGroup.WriteColumn(new DataColumn(idField, input));
                }
            }

            // Rewind and verify the file holds exactly what was written.
            stream.Position = 0;
            using (var reader = new ParquetReader(stream))
            {
                Assert.Equal(1, reader.RowGroupCount);

                using (ParquetRowGroupReader rowGroup = reader.OpenRowGroupReader(0))
                {
                    Assert.Equal(input.Length, rowGroup.RowCount);
                    Assert.Equal(input, rowGroup.ReadColumn(idField).Data);
                }
            }
        }
Example #2
0
        public void FileMetadata_sets_num_rows_on_file_and_row_group_multiple_row_groups()
        {
            var stream = new MemoryStream();
            var idField = new DataField <int>("id");

            // Write two row groups of different sizes: 4 rows, then 2 rows.
            using (var writer = new ParquetWriter(new Schema(idField), stream))
            {
                using (ParquetRowGroupWriter rowGroup = writer.CreateRowGroup())
                {
                    rowGroup.WriteColumn(new DataColumn(idField, new[] { 1, 2, 3, 4 }));
                }

                using (ParquetRowGroupWriter rowGroup = writer.CreateRowGroup())
                {
                    rowGroup.WriteColumn(new DataColumn(idField, new[] { 5, 6 }));
                }
            }

            // Read back: file-level metadata must report 6 rows total,
            // split 4 + 2 across the two row groups.
            using (var reader = new ParquetReader(stream))
            {
                Assert.Equal(6, reader.ThriftMetadata.Num_rows);

                using (ParquetRowGroupReader rowGroup = reader.OpenRowGroupReader(0))
                {
                    Assert.Equal(4, rowGroup.RowCount);
                }

                using (ParquetRowGroupReader rowGroup = reader.OpenRowGroupReader(1))
                {
                    Assert.Equal(2, rowGroup.RowCount);
                }
            }
        }
Example #3
0
        public static void WriteAthenaRowsAsParquet(this Stream stream, ResultSetMetadata tableSchema, List <FieldMapping> mappings, IEnumerable <Row> rows)
        {
            // Build one parquet column per Athena column, preserving order;
            // the indexed Select replaces a manual counter.
            List <DataColumn> columns = tableSchema.ColumnInfo
                .Select((column, index) => column.ToParquetColumn(mappings, index, rows))
                .ToList();

            var schema = new Schema(new ReadOnlyCollection <Field>(columns.Select(column => column.Field).ToArray()));

            using (var writer = new ParquetWriter(schema, stream))
            {
                writer.CompressionMethod = CompressionMethod.Snappy;

                // All columns go into a single row group.
                using (ParquetRowGroupWriter rowGroupWriter = writer.CreateRowGroup())
                {
                    foreach (DataColumn column in columns)
                    {
                        rowGroupWriter.WriteColumn(column);
                    }
                }
            }
        }
Example #4
0
        protected object WriteReadSingle(DataField field, object value)
        {
            using (var stream = new MemoryStream())
            {
                // Write a single value into a one-row row group.
                using (var writer = new ParquetWriter3(new Schema(field), stream))
                {
                    writer.CompressionMethod = CompressionMethod.None;

                    using (ParquetRowGroupWriter rowGroup = writer.CreateRowGroup(1))
                    {
                        var column = new DataColumn(field);
                        column.Add(value);

                        rowGroup.Write(column);
                    }
                }

                // Rewind and pull the first defined value out of the first
                // row group; null when the file has no row groups at all.
                stream.Position = 0;
                using (var reader = new ParquetReader3(stream))
                {
                    foreach (ParquetRowGroupReader rowGroupReader in reader)
                    {
                        DataColumn column = rowGroupReader.ReadColumn(field);

                        return column.DefinedData.OfType <object>().FirstOrDefault();
                    }

                    return null;
                }
            }
        }
Example #5
0
        public void CustomMetadata_can_write_and_read()
        {
            var stream = new MemoryStream();
            var idField = new DataField <int>("id");

            // Write a file carrying two custom key/value metadata entries.
            using (var writer = new ParquetWriter(new Schema(idField), stream))
            {
                writer.CustomMetadata = new Dictionary <string, string>
                {
                    ["key1"] = "value1",
                    ["key2"] = "value2"
                };

                using (ParquetRowGroupWriter rowGroup = writer.CreateRowGroup())
                {
                    rowGroup.WriteColumn(new DataColumn(idField, new[] { 1, 2, 3, 4 }));
                }
            }

            // The reader should surface both entries unchanged.
            using (var reader = new ParquetReader(stream))
            {
                Assert.Equal("value1", reader.CustomMetadata["key1"]);
                Assert.Equal("value2", reader.CustomMetadata["key2"]);
            }
        }
Example #6
0
        /// <summary>
        /// Intro sample: writes two columns (id, city) into a single
        /// two-row row group of a new parquet file.
        /// </summary>
        public void WriteIntro()
        {
            //create data columns with schema metadata and the data you need
            var idColumn = new DataColumn(
                new DataField <int>("id"),
                new int[] { 1, 2 });

            var cityColumn = new DataColumn(
                new DataField <string>("city"),
                new string[] { "London", "Derby" });

            // create file schema
            var schema = new Schema(idColumn.Field, cityColumn.Field);

            // File.Create truncates an existing file. The previous File.OpenWrite
            // did not, so rewriting over a longer existing file left stale trailing
            // bytes behind and produced a corrupt parquet footer.
            using (Stream fileStream = System.IO.File.Create("c:\\test.parquet"))
            {
                using (var parquetWriter = new ParquetWriter(schema, fileStream))
                {
                    // create a new row group in the file, sized for the 2 rows above
                    using (ParquetRowGroupWriter groupWriter = parquetWriter.CreateRowGroup(2))
                    {
                        groupWriter.WriteColumn(idColumn);
                        groupWriter.WriteColumn(cityColumn);
                    }
                }
            }
        }
Example #7
0
        /// <summary>
        /// Reads a comma-separated file and writes the columns declared in
        /// <c>parquet_types</c> into a single-row-group parquet file.
        /// </summary>
        static void ConvertCsvToParquet(string inputFile, string outputFile)
        {
            // column name -> parsed values, only for columns with a declared type
            var data = new Dictionary <string, ArrayList>();

            using (var reader = new StreamReader(inputFile, true))
            {
                var header = reader.ReadLine();
                if (header == null)
                {
                    // Empty input file: nothing to convert (previously this
                    // crashed with a NullReferenceException on header.Split).
                    return;
                }

                var columns = header.Split(",");
                for (int i = 0; i < columns.Length; i++)
                {
                    columns[i] = columns[i].Trim();
                }

                while (!reader.EndOfStream)
                {
                    var line = reader.ReadLine();
                    if (String.IsNullOrEmpty(line))
                    {
                        continue;
                    }

                    var parts = line.Split(",");
                    for (int i = 0; i < parts.Length && i < columns.Length; i++)
                    {
                        var column = columns[i];

                        // single TryGetValue replaces ContainsKey + indexer lookups
                        if (parquet_types.TryGetValue(column, out var parquetType))
                        {
                            if (!data.TryGetValue(column, out ArrayList values))
                            {
                                values = new ArrayList();
                                data.Add(column, values);
                            }

                            values.Add(ParseValue(parquetType, parts[i]));
                        }
                    }
                }
            }

            // NOTE(review): data[x.Key] throws KeyNotFoundException when a declared
            // column never appears in the CSV header — confirm this is intended.
            var datacolumns = parquet_types.Select(
                x => new DataColumn(CreateParquetField(x.Key, x.Value), data[x.Key].ToArray(ConvertParquetType(x.Value)))
                ).ToArray();
            var schema = new Schema(datacolumns.Select(x => (Field)x.Field).ToArray());

            // File.Create truncates an existing output file; File.OpenWrite would
            // leave stale trailing bytes when overwriting with a shorter file,
            // corrupting the parquet footer.
            using (Stream fileStream = System.IO.File.Create(outputFile))
            {
                using (var parquetWriter = new ParquetWriter(schema, fileStream))
                {
                    // create a new row group in the file
                    using (ParquetRowGroupWriter groupWriter = parquetWriter.CreateRowGroup())
                    {
                        foreach (var column in datacolumns)
                        {
                            groupWriter.WriteColumn(column);
                        }
                    }
                }
            }
        }
Example #8
0
        /// <inheritdoc/>
        public void Write(IList <IColumn> columns)
        {
            List <DataColumn> parquetColumns = CreateParquetColumns(columns);
            Schema schema = new Schema(parquetColumns.Select(p => p.Field).ToList());

            using (var parquetWriter = new ParquetWriter(schema, FileStream))
            {
                // TODO - Write is called many times; one for each rowgroup in the file. We do not need to compile
                // and write metadata many times. Refactor to write metadata only once.
                CryptoMetadata metadata = CompileMetadata(columns, FileEncryptionSettings);
                if (!metadata.IsEmpty())
                {
                    // Serialize the crypto metadata as indented JSON with enum
                    // names and nulls omitted, stored under a well-known key.
                    var serializerSettings = new JsonSerializerSettings()
                    {
                        NullValueHandling = NullValueHandling.Ignore,
                        Converters        = { new StringEnumConverter() },
                        Formatting        = Formatting.Indented
                    };

                    parquetWriter.CustomMetadata = new Dictionary <string, string>
                    {
                        [nameof(CryptoMetadata)] = JsonConvert.SerializeObject(metadata, serializerSettings)
                    };
                }

                // create a new row group in the file
                using (ParquetRowGroupWriter groupWriter = parquetWriter.CreateRowGroup())
                {
                    foreach (DataColumn column in parquetColumns)
                    {
                        groupWriter.WriteColumn(column);
                    }
                }
            }
        }
Example #9
0
        //TODO: Implement this class from scratch by leveraging grouping queries in the DB engine
        //TODO: Unit & Integration Test
        //https://stackoverflow.com/questions/50933429/how-to-view-apache-parquet-file-in-windows
        //https://github.com/elastacloud/parquet-dotnet
        /// <summary>
        /// Writes forecasts partitioned by country/category/year/month:
        /// one parquet file per partition directory.
        /// </summary>
        public void WriteData(IEnumerable <PriceForecast> data, string basePath)
        {
            var forecastsByCountry = data.GroupBy(f => f.CountryCode);

            foreach (var countryGroup in forecastsByCountry)
            {
                var country = countryGroup.Key;

                var forecastsByCategory = countryGroup.ToList().GroupBy(f => f.Category);
                foreach (var categoryGroup in forecastsByCategory)
                {
                    var category = categoryGroup.Key;

                    var forecastsByYear = categoryGroup.ToList().GroupBy(f => f.ForecastedDate.Year);
                    foreach (var yearGroup in forecastsByYear)
                    {
                        var year = yearGroup.Key;

                        var forecastsByMonth = yearGroup.ToList().GroupBy(f => f.ForecastedDate.Month);
                        foreach (var monthGroup in forecastsByMonth)
                        {
                            var month     = monthGroup.Key;
                            var forecasts = monthGroup.ToList();
                            var dirPath   = $"{basePath}/Country={country}/Category={category}/Year={year}/Month={month}/";
                            var filePath  = dirPath + "forecast.parquet";

                            //TODO: automating schema generation using reflection and attributes
                            var columns = new DataColumn[]
                            {
                                new DataColumn(ParquetSchemaHelper.ForecastDateField, forecasts.Select(f => f.ForecastDateTime.ToString()).ToArray()),
                                new DataColumn(ParquetSchemaHelper.ForecastModelField, forecasts.Select(f => f.ForecastModel).ToArray()),
                                new DataColumn(ParquetSchemaHelper.MarketField, forecasts.Select(f => f.Market).ToArray()),
                                new DataColumn(ParquetSchemaHelper.ProductField, forecasts.Select(f => f.Product).ToArray()),
                                new DataColumn(ParquetSchemaHelper.CountryField, forecasts.Select(f => f.CountryCode).ToArray()),
                                new DataColumn(ParquetSchemaHelper.ForecastedDateField, forecasts.Select(f => f.ForecastedDate.ToString()).ToArray()),
                                new DataColumn(ParquetSchemaHelper.CategoryField, forecasts.Select(f => f.Category).ToArray()),
                                new DataColumn(ParquetSchemaHelper.PriceField, forecasts.Select(f => f.Price).ToArray())
                            };


                            var           schema = new Schema(columns.Select(c => c.Field).ToArray());
                            DirectoryInfo di     = Directory.CreateDirectory(dirPath);//safe
                            // File.Create truncates an existing partition file; the
                            // previous File.OpenWrite did not, which corrupted the file
                            // when a partition was rewritten with fewer rows than before.
                            using Stream fileStream = System.IO.File.Create(filePath);
                            using var parquetWriter = new ParquetWriter(schema, fileStream);
                            using (ParquetRowGroupWriter groupWriter = parquetWriter.CreateRowGroup())
                            {
                                foreach (var col in columns)
                                {
                                    groupWriter.WriteColumn(col);
                                }
                            }
                        }
                    }
                }
            }
        }
Example #10
0
 /// <summary>
 ///		Writes one group of rows to the file
 /// </summary>
 private void FlushRowGroup(ParquetWriter writer, Table table)
 {
     // Nothing to flush for an empty table.
     if (table.Count == 0)
     {
         return;
     }

     using (ParquetRowGroupWriter rowGroupWriter = writer.CreateRowGroup())
     {
         rowGroupWriter.Write(table);
     }
 }
Example #11
0
        public void Write_in_small_row_groups()
        {
            // Produce a single file containing three one-row row groups
            // holding the values 1, 2 and 3 respectively.
            var idField = new DataField <int>("id");
            var stream = new MemoryStream();

            using (var writer = new ParquetWriter(new Schema(idField), stream))
            {
                for (int value = 1; value <= 3; value++)
                {
                    using (ParquetRowGroupWriter rowGroup = writer.CreateRowGroup())
                    {
                        rowGroup.WriteColumn(new DataColumn(idField, new int[] { value }));
                    }
                }
            }

            // Read the file back and validate each row group independently.
            stream.Position = 0;
            using (var reader = new ParquetReader(stream))
            {
                Assert.Equal(3, reader.RowGroupCount);

                for (int groupIndex = 0; groupIndex < 3; groupIndex++)
                {
                    using (ParquetRowGroupReader rowGroup = reader.OpenRowGroupReader(groupIndex))
                    {
                        Assert.Equal(1, rowGroup.RowCount);
                        DataColumn dataColumn = rowGroup.ReadColumn(idField);
                        Assert.Equal(new int[] { groupIndex + 1 }, dataColumn.Data);
                    }
                }
            }
        }
Example #12
0
        public void Cannot_write_columns_in_wrong_order()
        {
            var schema = new Schema(new DataField <int>("id"), new DataField <int>("id2"));

            using (var writer = new ParquetWriter(schema, new MemoryStream()))
            {
                using (ParquetRowGroupWriter groupWriter = writer.CreateRowGroup())
                {
                    // Writing the second schema column first must be rejected.
                    Assert.Throws <ArgumentException>(() =>
                    {
                        groupWriter.WriteColumn(new DataColumn((DataField)schema[1], new int[] { 1 }));
                    });
                }
            }
        }
        public static void WriteParquetColumns(this Stream stream, List <DataColumn> columns)
        {
            // Schema fields mirror the supplied columns, in order.
            var schema = new Schema(new ReadOnlyCollection <Field>(columns.Select(column => column.Field).ToArray()));

            using (var writer = new ParquetWriter(schema, stream))
            {
                writer.CompressionMethod = CompressionMethod.Snappy;

                // single row group holding every column
                using (ParquetRowGroupWriter rowGroupWriter = writer.CreateRowGroup())
                {
                    foreach (DataColumn column in columns)
                    {
                        rowGroupWriter.WriteColumn(column);
                    }
                }
            }
        }
Example #14
0
        /// <summary>
        /// Serializes a sequence of <typeparamref name="T"/> instances into a
        /// Snappy-compressed parquet file, one column per public property.
        /// </summary>
        public static void WriteParquet <T>(this Stream stream, IEnumerable <T> items) where T : class
        {
            // Materialize the source exactly once: the sequence is re-enumerated
            // for every property below, which is wasteful and, for non-repeatable
            // sequences, could yield different data per column.
            IList <T> itemList = items as IList <T> ?? items.ToList();

            Type classType = typeof(T);

            var properties = classType.GetProperties();

            List <DataColumn> columns = new List <DataColumn>();

            foreach (var prop in properties)
            {
                if (prop.PropertyType == DateTimeType)
                {
                    // DateTime properties are stored as DateTimeOffset values
                    // with date-and-time precision.
                    columns.Add(new DataColumn(
                                    new DateTimeDataField(prop.Name, DateTimeFormat.DateAndTime),
                                    itemList.Select(item => new DateTimeOffset(((DateTime)prop.GetValue(item)))).ToArray()
                                    ));
                }
                else
                {
                    // Build DataField<TProp> and a typed value array via reflection,
                    // since the property type is only known at runtime.
                    var       genericArguments   = new Type[] { prop.PropertyType };
                    var       genericType        = DataFieldGenericType.MakeGenericType(genericArguments);
                    var       genericConstructor = genericType.GetConstructor(DataFieldConstructorGenericArguments);
                    DataField field         = genericConstructor.Invoke(new object[] { prop.Name }) as DataField;
                    var       dataSource    = itemList.Select(item => prop.GetValue(item));
                    var       castMethod    = CastMethodGeneric.MakeGenericMethod(genericArguments);
                    var       toArrayMethod = ToArrayMethodGeneric.MakeGenericMethod(genericArguments);
                    var       data          = toArrayMethod.Invoke(null, new object[] { castMethod.Invoke(null, new object[] { dataSource }) }) as Array;
                    var       column        = new DataColumn(field, data);
                    columns.Add(column);
                }
            }

            Schema schema = new Schema(new ReadOnlyCollection <Field>(columns.Select(column => column.Field).ToArray()));

            using (ParquetWriter writer = new ParquetWriter(schema, stream))
            {
                writer.CompressionMethod = CompressionMethod.Snappy;
                // single row group holding every column
                using (ParquetRowGroupWriter rowGroupWriter = writer.CreateRowGroup())
                {
                    foreach (var column in columns)
                    {
                        rowGroupWriter.WriteColumn(column);
                    }
                }
            }
        }
Example #15
0
        public void Write_multiple_row_groups_to_forward_only_stream()
        {
            // Back the forward-only wrapper with a memory stream so the
            // written bytes can still be inspected afterwards.
            var memory      = new MemoryStream();
            var forwardOnly = new WriteableNonSeekableStream(memory);

            var schema = new Schema(
                new DataField <int>("id"),
                new DataField <string>("nonsense"));

            var idField   = (DataField)schema[0];
            var textField = (DataField)schema[1];

            // Two single-row row groups written through the non-seekable stream.
            using (var writer = new ParquetWriter(schema, forwardOnly))
            {
                using (ParquetRowGroupWriter rowGroupWriter = writer.CreateRowGroup(1))
                {
                    rowGroupWriter.WriteColumn(new DataColumn(idField, new[] { 1 }));
                    rowGroupWriter.WriteColumn(new DataColumn(textField, new[] { "1" }));
                }

                using (ParquetRowGroupWriter rowGroupWriter = writer.CreateRowGroup(1))
                {
                    rowGroupWriter.WriteColumn(new DataColumn(idField, new[] { 2 }));
                    rowGroupWriter.WriteColumn(new DataColumn(textField, new[] { "2" }));
                }
            }

            // Validate both row groups from the underlying seekable stream.
            memory.Position = 0;
            using (var reader = new ParquetReader(memory))
            {
                Assert.Equal(2, reader.RowGroupCount);

                using (ParquetRowGroupReader rowGroupReader = reader.OpenRowGroupReader(0))
                {
                    Assert.Equal(1, rowGroupReader.RowCount);

                    DataColumn column = rowGroupReader.ReadColumn(idField);
                    Assert.Equal(1, column.Data.GetValue(0));
                }

                using (ParquetRowGroupReader rowGroupReader = reader.OpenRowGroupReader(1))
                {
                    Assert.Equal(1, rowGroupReader.RowCount);

                    DataColumn column = rowGroupReader.ReadColumn(idField);
                    Assert.Equal(2, column.Data.GetValue(0));
                }
            }
        }
Example #16
0
        public void Write_simplest_int_and_string_columns_in_one_row_group()
        {
            var schema = new Schema(new DataField <int>("id"), new DataField <string>("name"));

            using (var stream = new MemoryStream())
            {
                using (var writer = new ParquetWriter3(schema, stream))
                {
                    writer.CompressionMethod = CompressionMethod.None;

                    // one 3-row group containing both columns, in schema order
                    using (ParquetRowGroupWriter rowGroup = writer.CreateRowGroup(3))
                    {
                        rowGroup.Write(CreateColumn(schema[0], 1, 2, 3));
                        rowGroup.Write(CreateColumn(schema[1], "first", "second", "third"));
                    }
                }
            }
        }
Example #17
0
        private static void WriteDataInFile(string path, List <DataColumn> schemaColumns)
        {
            // Schema fields mirror the supplied columns, in order.
            var schema = new Schema(schemaColumns.ConvertAll(col => col.Field));

            using (Stream fileStream = File.Create(path))
            {
                using (var writer = new ParquetWriter(schema, fileStream))
                {
                    // everything goes into a single row group
                    using (ParquetRowGroupWriter rowGroupWriter = writer.CreateRowGroup())
                    {
                        foreach (DataColumn column in schemaColumns)
                        {
                            rowGroupWriter.WriteColumn(column);
                        }
                    }
                }
            }
        }
Example #18
0
        public static void BuildParquetFile(DataColumn license, DataColumn sensor, DataColumn time, string outPath)
        {
            var schema = new Schema(license.Field, sensor.Field, time.Field);

            using (Stream fileStream = File.Create(outPath))
            {
                using (var writer = new ParquetWriter(schema, fileStream))
                {
                    // gzip-compress the data pages
                    writer.CompressionMethod = CompressionMethod.Gzip;

                    // columns are written in schema order into one row group
                    using (ParquetRowGroupWriter rowGroup = writer.CreateRowGroup())
                    {
                        rowGroup.WriteColumn(license);
                        rowGroup.WriteColumn(sensor);
                        rowGroup.WriteColumn(time);
                    }
                }
            }
        }
        protected object WriteReadSingle(DataField field, object value, CompressionMethod compressionMethod = CompressionMethod.None)
        {
            // For sanity, use disconnected write and read streams.
            byte[] fileBytes;

            using (var writeStream = new MemoryStream())
            {
                // write a single value into a one-row row group
                using (var writer = new ParquetWriter3(new Schema(field), writeStream))
                {
                    writer.CompressionMethod = compressionMethod;

                    using (ParquetRowGroupWriter rowGroup = writer.CreateRowGroup(1))
                    {
                        var column = new DataColumn(field);
                        column.Add(value);

                        rowGroup.Write(column);
                    }
                }

                fileBytes = writeStream.ToArray();

                //F.WriteAllBytes($"c:\\tmp\\{compressionMethod}.parquet", fileBytes);
            }

            using (var readStream = new MemoryStream(fileBytes))
            {
                // read the single value back
                readStream.Position = 0;
                using (var reader = new ParquetReader3(readStream))
                {
                    using (ParquetRowGroupReader rowGroupReader = reader.OpenRowGroupReader(0))
                    {
                        DataColumn column = rowGroupReader.ReadColumn(field);

                        return column.DefinedData.OfType <object>().FirstOrDefault();
                    }
                }
            }
        }
Example #20
0
        protected object WriteReadSingle(DataField field, object value, CompressionMethod compressionMethod = CompressionMethod.None, int compressionLevel = -1)
        {
            // For sanity, use disconnected write and read streams.
            byte[] fileBytes;

            using (var writeStream = new MemoryStream())
            {
                // write a single value
                using (var writer = new ParquetWriter(new Schema(field), writeStream))
                {
                    writer.CompressionMethod = compressionMethod;
                    writer.CompressionLevel  = compressionLevel;

                    using (ParquetRowGroupWriter rowGroup = writer.CreateRowGroup())
                    {
                        // build a one-element array of the field's CLR type
                        Array dataArray = Array.CreateInstance(field.ClrNullableIfHasNullsType, 1);
                        dataArray.SetValue(value, 0);

                        rowGroup.WriteColumn(new DataColumn(field, dataArray));
                    }
                }

                fileBytes = writeStream.ToArray();
            }

            using (var readStream = new MemoryStream(fileBytes))
            {
                // read the single value back
                readStream.Position = 0;
                using (var reader = new ParquetReader(readStream))
                {
                    using (ParquetRowGroupReader rowGroupReader = reader.OpenRowGroupReader(0))
                    {
                        DataColumn column = rowGroupReader.ReadColumn(field);

                        return column.Data.GetValue(0);
                    }
                }
            }
        }
Example #21
0
        public void Append_to_file_reads_all_data()
        {
            // Start with a file containing a single row group.
            var idField = new DataField <int>("id");
            var stream = new MemoryStream();

            using (var writer = new ParquetWriter(new Schema(idField), stream))
            {
                using (ParquetRowGroupWriter rowGroup = writer.CreateRowGroup())
                {
                    rowGroup.WriteColumn(new DataColumn(idField, new int[] { 1, 2 }));
                }
            }

            // Append to the file. An existing row group cannot be extended,
            // so the appended data goes into a new row group.
            stream.Position = 0;
            using (var writer = new ParquetWriter(new Schema(idField), stream, append: true))
            {
                using (ParquetRowGroupWriter rowGroup = writer.CreateRowGroup())
                {
                    rowGroup.WriteColumn(new DataColumn(idField, new int[] { 3, 4 }));
                }
            }

            // The file should now expose two row groups with all data intact.
            stream.Position = 0;
            using (var reader = new ParquetReader(stream))
            {
                Assert.Equal(2, reader.RowGroupCount);

                using (ParquetRowGroupReader rowGroup = reader.OpenRowGroupReader(0))
                {
                    Assert.Equal(2, rowGroup.RowCount);
                    Assert.Equal(new int[] { 1, 2 }, rowGroup.ReadColumn(idField).Data);
                }

                using (ParquetRowGroupReader rowGroup = reader.OpenRowGroupReader(1))
                {
                    Assert.Equal(2, rowGroup.RowCount);
                    Assert.Equal(new int[] { 3, 4 }, rowGroup.ReadColumn(idField).Data);
                }
            }
        }
Example #22
0
        protected object WriteReadSingle(
            DataField field, object value,
            CompressionMethod compressionMethod = CompressionMethod.None,
            bool flushToDisk = false)
        {
            using (var stream = new MemoryStream())
            {
                // write a single value into a one-row row group
                using (var writer = new ParquetWriter3(new Schema(field), stream))
                {
                    writer.CompressionMethod = compressionMethod;

                    using (ParquetRowGroupWriter rowGroup = writer.CreateRowGroup(1))
                    {
                        var column = new DataColumn(field);
                        column.Add(value);

                        rowGroup.Write(column);
                    }
                }

                // optionally dump the stream to a temp file for inspection
                if (flushToDisk)
                {
                    FlushTempFile(stream);
                }

                // read the single value back
                stream.Position = 0;
                using (var reader = new ParquetReader3(stream))
                {
                    using (ParquetRowGroupReader rowGroupReader = reader.OpenRowGroupReader(0))
                    {
                        DataColumn column = rowGroupReader.ReadColumn(field);

                        return column.DefinedData.OfType <object>().FirstOrDefault();
                    }
                }
            }
        }
Example #23
0
        // Flushes the buffered group (_group.Data) into one parquet row group:
        // one column per key that has a declared field in Fields. Typed scratch
        // arrays are cached in instance fields (via ??=) while the group is
        // written, then released at the end of the method.
        private void WriteGroup(ParquetWriter parquetWriter)
        {
            AddMandatoryFields();

            using (ParquetRowGroupWriter groupWriter = parquetWriter.CreateRowGroup())
            {
                foreach (var kvp in _group.Data)
                {
                    // skip buffered data that has no matching declared field
                    if (Fields.TryGetValue(kvp.Key, out var field) == false)
                    {
                        continue;
                    }

                    var   data = kvp.Value;
                    Array array;

                    // Pick (or lazily allocate) the scratch array matching the
                    // field's data type. The first column of a given type sizes
                    // the array to its own count; the Debug.Assert below implies
                    // every column in the group is expected to have the same count.
                    switch (field.DataType)
                    {
                    case DataType.Boolean:
                        array = _boolArr ??= new bool[data.Count];
                        break;

                    case DataType.Byte:
                        array = _byteArr ??= new byte[data.Count];
                        break;

                    case DataType.SignedByte:
                        array = _sbyteArr ??= new sbyte[data.Count];
                        break;

                    case DataType.Short:
                        array = _shortArr ??= new short[data.Count];
                        break;

                    case DataType.Int32:
                        array = _intArr ??= new int[data.Count];
                        break;

                    case DataType.Int64:
                        array = _longArr ??= new long[data.Count];
                        break;

                    case DataType.UnsignedInt16:
                        array = _ushortArr ??= new ushort[data.Count];
                        break;

                    case DataType.UnsignedInt32:
                        array = _uintArr ??= new uint[data.Count];
                        break;

                    case DataType.UnsignedInt64:
                        array = _ulongArr ??= new ulong[data.Count];
                        break;

                    case DataType.Float:
                        array = _floatArr ??= new float[data.Count];
                        break;

                    case DataType.Double:
                        array = _doubleArr ??= new double[data.Count];
                        break;

                    case DataType.Decimal:
                        array = _decimalArr ??= new decimal[data.Count];
                        break;

                    case DataType.String:
                        array = _strArr ??= new string[data.Count];
                        break;

                    case DataType.DateTimeOffset:
                        array = _dtoArr ??= new DateTimeOffset[data.Count];
                        break;

                    case DataType.TimeSpan:
                        array = _tsArr ??= new TimeSpan[data.Count];
                        break;

                    default:
                        // NOTE(review): ThrowUnsupportedDataType presumably throws,
                        // but the compiler still requires the return — confirm.
                        ThrowUnsupportedDataType(field.DataType);
                        return;
                    }

                    // cached array must exactly match this column's row count
                    Debug.Assert(array.Length == data.Count, $"Invalid field data on property '{kvp.Key}'");

                    data.CopyTo(array, 0);
                    groupWriter.WriteColumn(new DataColumn(field, array));
                }
            }

            // Release all cached scratch arrays so the next group allocates
            // arrays sized for its own row count.
            _boolArr    = null;
            _strArr     = null;
            _dtoArr     = null;
            _tsArr      = null;
            _byteArr    = null;
            _sbyteArr   = null;
            _shortArr   = null;
            _intArr     = null;
            _longArr    = null;
            _ushortArr  = null;
            _uintArr    = null;
            _ulongArr   = null;
            _floatArr   = null;
            _doubleArr  = null;
            _decimalArr = null;
        }
Example #24
0
        /// <summary>
        /// Persists the training records as "<paramref name="outputFile"/>.parquet" next to the
        /// executing assembly, one column per record property, all in a single row group.
        /// </summary>
        /// <param name="records">Records to save; enumerated exactly once.</param>
        /// <param name="outputFile">Output file name without the ".parquet" extension.</param>
        public void SaveParquet(IEnumerable <TrainingWindTurbineRecord> records, string outputFile)
        {
            // Materialize once up front: the previous version enumerated `records` 22 times
            // (once per column), re-running any deferred query for every single column.
            var recs = records.ToList();

            // Create data columns with schema metadata and the data. The order of this array
            // defines both the schema field order and the column write order.
            var columns = new[]
            {
                new DataColumn(new DataField <int>("TurbineId"), recs.Select(x => x.TurbineId).ToArray()),
                new DataColumn(new DataField <double>("GearboxOilLevel"), recs.Select(x => x.GearboxOilLevel).ToArray()),
                new DataColumn(new DataField <double>("GearboxOilTemp"), recs.Select(x => x.GearboxOilTemp).ToArray()),
                new DataColumn(new DataField <double>("GeneratorActivePower"), recs.Select(x => x.GeneratorActivePower).ToArray()),
                new DataColumn(new DataField <double>("GeneratorSpeed"), recs.Select(x => x.GeneratorSpeed).ToArray()),
                new DataColumn(new DataField <double>("GeneratorTemp"), recs.Select(x => x.GeneratorTemp).ToArray()),
                new DataColumn(new DataField <double>("GeneratorTorque"), recs.Select(x => x.GeneratorTorque).ToArray()),
                new DataColumn(new DataField <double>("GridFrequency"), recs.Select(x => x.GridFrequency).ToArray()),
                new DataColumn(new DataField <double>("GridVoltage"), recs.Select(x => x.GridVoltage).ToArray()),
                new DataColumn(new DataField <double>("HydraulicOilPressure"), recs.Select(x => x.HydraulicOilPressure).ToArray()),
                new DataColumn(new DataField <double>("NacelleAngle"), recs.Select(x => x.NacelleAngle).ToArray()),
                new DataColumn(new DataField <double>("OverallWindDirection"), recs.Select(x => x.OverallWindDirection).ToArray()),
                new DataColumn(new DataField <double>("WindSpeedStdDev"), recs.Select(x => x.WindSpeedStdDev).ToArray()),
                new DataColumn(new DataField <bool>("Precipitation"), recs.Select(x => x.Precipitation).ToArray()),
                new DataColumn(new DataField <double>("TurbineWindDirection"), recs.Select(x => x.TurbineWindDirection).ToArray()),
                new DataColumn(new DataField <double>("TurbineSpeedStdDev"), recs.Select(x => x.TurbineSpeedStdDev).ToArray()),
                new DataColumn(new DataField <double>("WindSpeedAverage"), recs.Select(x => x.WindSpeedAverage).ToArray()),
                new DataColumn(new DataField <double>("WindTempAverage"), recs.Select(x => x.WindTempAverage).ToArray()),
                new DataColumn(new DataField <double>("PitchAngle"), recs.Select(x => x.PitchAngle).ToArray()),
                new DataColumn(new DataField <double>("Vibration"), recs.Select(x => x.Vibration).ToArray()),
                new DataColumn(new DataField <double>("TurbineSpeedAverage"), recs.Select(x => x.TurbineSpeedAverage).ToArray()),
                new DataColumn(new DataField <bool>("AlterBlades"), recs.Select(x => x.AlterBlades).ToArray()),
            };

            // create file schema from the column fields (same order as above)
            var schema = new Schema(columns.Select(c => c.Field).ToArray());

            var outputPath = Path.GetDirectoryName(System.Reflection.Assembly.GetExecutingAssembly().Location);

            // File.Create truncates an existing file. The previous File.OpenWrite did not:
            // writing a shorter parquet file over a longer pre-existing one would leave
            // trailing garbage after the footer and produce a corrupt file.
            using (Stream fileStream = File.Create(Path.Combine(outputPath, $"{outputFile}.parquet")))
            {
                using (var parquetWriter = new ParquetWriter(schema, fileStream))
                {
                    // create a new row group in the file containing every column
                    using (ParquetRowGroupWriter groupWriter = parquetWriter.CreateRowGroup())
                    {
                        foreach (DataColumn column in columns)
                        {
                            groupWriter.WriteColumn(column);
                        }
                    }
                }
            }
        }
Example #25
0
        /// <summary>
        /// Performance harness: reads every column of the first row group of a large sample
        /// file, then measures writing the same data back uncompressed and gzip-compressed.
        /// </summary>
        /// <param name="readTime">Elapsed time for the read phase.</param>
        /// <param name="uncompressedWriteTime">Elapsed time for the uncompressed write.</param>
        /// <param name="gzipWriteTime">Elapsed time for the gzip write.</param>
        private static void ReadLargeFile(out TimeSpan readTime,
                                          out TimeSpan uncompressedWriteTime,
                                          out TimeSpan gzipWriteTime)
        {
            Schema schema;

            DataColumn[] columns;

            using (var time = new TimeMeasure())
            {
                // NOTE(review): hard-coded dev-machine path — this is a local perf harness.
                using (var reader = ParquetReader.OpenFromFile(@"C:\dev\parquet-dotnet\src\Parquet.Test\data\customer.impala.parquet", new ParquetOptions {
                    TreatByteArrayAsString = true
                }))
                {
                    schema = reader.Schema;
                    var cl = new List <DataColumn>();

                    using (ParquetRowGroupReader rgr = reader.OpenRowGroupReader(0))
                    {
                        foreach (DataField field in reader.Schema.GetDataFields())
                        {
                            cl.Add(rgr.ReadColumn(field));
                        }
                    }

                    columns = cl.ToArray();
                }

                readTime = time.Elapsed;
            }

            uncompressedWriteTime = TimedWrite(schema, columns, "perf.uncompressed.parquet", CompressionMethod.None);
            gzipWriteTime         = TimedWrite(schema, columns, "perf.gzip.parquet", CompressionMethod.Gzip);
        }

        /// <summary>
        /// Writes all columns into a single row group at <paramref name="path"/> with the given
        /// compression and returns the elapsed write time. Uses F.Create (truncating) instead of
        /// the previous F.OpenWrite, which left trailing bytes of any longer pre-existing file
        /// after the parquet footer, corrupting the output.
        /// </summary>
        private static TimeSpan TimedWrite(Schema schema, DataColumn[] columns, string path, CompressionMethod compression)
        {
            using (FileStream dest = F.Create(path))
            {
                // Timing starts after the file handle is opened, matching the original
                // measurement boundary.
                using (var time = new TimeMeasure())
                {
                    using (var writer = new ParquetWriter(schema, dest))
                    {
                        writer.CompressionMethod = compression;

                        using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
                        {
                            foreach (DataColumn dc in columns)
                            {
                                rg.WriteColumn(dc);
                            }
                        }
                    }

                    return time.Elapsed;
                }
            }
        }
Example #26
0
        /// <summary>
        /// Writes dataLen rows and typed columns to the file.
        /// </summary>
        /// <param name="csvColumns">Processed CSV data</param>
        /// <param name="dataLen">Row count</param>
        /// <param name="writer">ParquetWriter</param>
        /// <param name="fields">Field structure</param>
        /// <param name="config">Config structure</param>
        public static void WriteGroup(List <Object> csvColumns, long dataLen, ParquetWriter writer, List <DataField> fields, Config config)
        {
            // Cast a boxed column back to its concrete array type for a field that allows
            // nulls. String arrays are shared with the non-nullable path since string is
            // already a nullable reference type.
            static Array AsNullableArray(DataField field, object raw) => field.DataType switch
            {
                DataType.Boolean        => (Array)(bool?[])raw,
                DataType.DateTimeOffset => (DateTimeOffset?[])raw,
                DataType.Decimal        => (decimal?[])raw,
                DataType.Double         => (double?[])raw,
                DataType.Float          => (float?[])raw,
                DataType.Int16          => (Int16?[])raw,
                DataType.Int32          => (Int32?[])raw,
                DataType.Int64          => (Int64?[])raw,
                DataType.String         => (string[])raw,
                _ => throw new ArgumentOutOfRangeException(field.DataType.ToString()),
            };

            // Cast a boxed column back to its concrete array type for a non-nullable field.
            static Array AsPlainArray(DataField field, object raw) => field.DataType switch
            {
                DataType.Boolean        => (Array)(bool[])raw,
                DataType.DateTimeOffset => (DateTimeOffset[])raw,
                DataType.Decimal        => (decimal[])raw,
                DataType.Double         => (double[])raw,
                DataType.Float          => (float[])raw,
                DataType.Int16          => (Int16[])raw,
                DataType.Int32          => (Int32[])raw,
                DataType.Int64          => (Int64[])raw,
                DataType.String         => (string[])raw,
                _ => throw new ArgumentOutOfRangeException(field.DataType.ToString()),
            };

            using (ParquetRowGroupWriter rowGroup = writer.CreateRowGroup())
            {
                for (int col = 0; col < fields.Count; col++)
                {
                    DataField field = fields[col];
                    object    raw   = csvColumns[col];

                    Array typed = field.HasNulls
                        ? AsNullableArray(field, raw)
                        : AsPlainArray(field, raw);

                    rowGroup.WriteColumn(new DataColumn(field, typed));
                }
            }
        }