/// <summary>
/// Writes <see cref="DataSet"/> to a target file
/// </summary>
/// <param name="dataSet"><see cref="DataSet"/> to write</param>
/// <param name="fileName">Path to a file to write to.</param>
/// <param name="compression">Compression method</param>
/// <param name="formatOptions">Parquet options, optional.</param>
/// <param name="writerOptions">Writer options, optional.</param>
/// <param name="append">When true, assumes the target already contains a Parquet file and appends data to it; otherwise writes a new Parquet file.</param>
public static void WriteFile(
   DataSet dataSet,
   string fileName,
   CompressionMethod compression = CompressionMethod.Gzip,
   ParquetOptions formatOptions = null,
   WriterOptions writerOptions = null,
   bool append = false)
{
   using (Stream fs = System.IO.File.Create(fileName))
   {
      using (var writer = new ParquetWriter(fs, formatOptions, writerOptions))
      {
         //note: the append flag is not forwarded to the writer in this overload
         writer.Write(dataSet, compression);
      }
   }
}
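// Usage sketch (not part of the original source): writes a DataSet to a file with Snappy compression.
// BuildDataSet() is a hypothetical helper standing in for whatever code creates and populates the
// DataSet; the file path and compression choice are arbitrary, and omitted arguments fall back to the
// defaults declared above.
private static void WriteFileUsageSketch()
{
   DataSet dataSet = BuildDataSet(); //hypothetical helper, not defined here
   ParquetWriter.WriteFile(dataSet, @"C:\data\people.parquet", CompressionMethod.Snappy);
}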
/// <summary>
/// Serializes a collection of objects to a Parquet stream, reflecting the schema from <typeparamref name="T"/> when none is supplied.
/// </summary>
public static Schema Serialize<T>(IEnumerable<T> objectInstances, Stream destination,
   Schema schema = null,
   WriterOptions writerOptions = null,
   CompressionMethod compressionMethod = CompressionMethod.Snappy)
   where T : new()
{
   if (objectInstances == null) throw new ArgumentNullException(nameof(objectInstances));
   if (destination == null) throw new ArgumentNullException(nameof(destination));
   if (!destination.CanWrite) throw new ArgumentException("stream must be writeable", nameof(destination));

   //if schema is not passed, reflect it from T
   if (schema == null)
   {
      schema = SchemaReflector.Reflect<T>();
   }

   if (writerOptions == null)
   {
      writerOptions = new WriterOptions();
   }

   var extractor = new ColumnExtractor();

   using (var writer = new ParquetWriter3(schema, destination, writerOptions: writerOptions))
   {
      writer.CompressionMethod = compressionMethod;

      //write the source in batches, one row group per batch
      foreach (IEnumerable<T> batch in objectInstances.Batch(writerOptions.RowGroupsSize))
      {
         IReadOnlyCollection<DataColumn> columns = extractor.ExtractColumns(batch, schema);

         using (ParquetRowGroupWriter groupWriter = writer.CreateRowGroup(batch.Count()))
         {
            foreach (DataColumn dataColumn in columns)
            {
               groupWriter.Write(dataColumn);
            }
         }
      }
   }

   return schema;
}
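// Usage sketch (not part of the original source): serializes a sequence of POCOs, letting the schema
// be reflected from the public properties of T. The Record class and file name are illustrative; rows
// are split into row groups of WriterOptions.RowGroupsSize by the Serialize<T> method above.
class Record
{
   public int Id { get; set; }
   public string City { get; set; }
}

private static void SerializeUsageSketch()
{
   IEnumerable<Record> records = Enumerable.Range(0, 1000)
      .Select(i => new Record { Id = i, City = "City " + (i % 10) });

   using (Stream destination = System.IO.File.Create("records.parquet"))
   {
      Schema schema = Serialize(records, destination, compressionMethod: CompressionMethod.Snappy);
   }
}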
/// <summary>
/// Creates an instance of parquet writer on top of a stream
/// </summary>
/// <param name="output">Writeable, seekable stream</param>
/// <param name="formatOptions">Additional options</param>
/// <param name="writerOptions">The writer options.</param>
/// <exception cref="ArgumentNullException">Output is null.</exception>
/// <exception cref="ArgumentException">Output stream is not writeable</exception>
public ParquetWriter(Stream output, ParquetOptions formatOptions = null, WriterOptions writerOptions = null)
   : base(new GapStream(output))
{
   if (output == null) throw new ArgumentNullException(nameof(output));
   if (!output.CanWrite) throw new ArgumentException("stream is not writeable", nameof(output));

   _formatOptions = formatOptions ?? new ParquetOptions();
   _writerOptions = writerOptions ?? new WriterOptions();
}
/// <summary>
/// Creates an instance of parquet writer on top of a stream
/// </summary>
/// <param name="output">Writeable, seekable stream</param>
/// <param name="formatOptions">Additional options</param>
/// <param name="writerOptions">The writer options.</param>
/// <exception cref="ArgumentNullException">Output is null.</exception>
/// <exception cref="ArgumentException">Output stream is not writeable</exception>
public ParquetWriter(Stream output, ParquetOptions formatOptions = null, WriterOptions writerOptions = null)
   : base(output)
{
   _output = output ?? throw new ArgumentNullException(nameof(output));
   if (!output.CanWrite) throw new ArgumentException("stream is not writeable", nameof(output));

   _formatOptions = formatOptions ?? new ParquetOptions();
   _writerOptions = writerOptions ?? new WriterOptions();

   _meta = new FileMetadataBuilder(_writerOptions);
   _plainWriter = new PlainValuesWriter(_formatOptions);
   _rleWriter = new RunLengthBitPackingHybridValuesWriter();
   _dicWriter = new PlainDictionaryValuesWriter(_rleWriter);
}
/// <summary>
/// Creates an instance of parquet writer on top of a stream
/// </summary>
/// <param name="output">Writeable, seekable stream</param>
/// <param name="formatOptions">Additional options</param>
/// <param name="writerOptions">The writer options.</param>
/// <exception cref="ArgumentNullException">Output is null.</exception>
/// <exception cref="ArgumentException">Output stream is not writeable</exception>
public ParquetWriter(Stream output, ParquetOptions formatOptions = null, WriterOptions writerOptions = null)
   : base(new PositionTrackingStream(output))
{
   if (output == null) throw new ArgumentNullException(nameof(output));
   if (!output.CanWrite) throw new ArgumentException("stream is not writeable", nameof(output));

   _formatOptions = formatOptions ?? new ParquetOptions();
   _writerOptions = writerOptions ?? new WriterOptions();

   _meta = new FileMetadataBuilder();
}
/// <summary>
/// Creates an instance of parquet writer on top of a stream
/// </summary>
/// <param name="schema">Schema of the data to be written</param>
/// <param name="output">Writeable, seekable stream</param>
/// <param name="formatOptions">Additional options</param>
/// <param name="writerOptions">The writer options.</param>
/// <param name="append">When true, appends to an existing Parquet file in the stream; otherwise writes a new file.</param>
/// <exception cref="ArgumentNullException">Output is null.</exception>
/// <exception cref="ArgumentException">Output stream is not writeable</exception>
public ParquetWriter3(Schema schema, Stream output, ParquetOptions formatOptions = null, WriterOptions writerOptions = null, bool append = false)
   : base(new GapStream(output))
{
   if (output == null) throw new ArgumentNullException(nameof(output));
   if (!output.CanWrite) throw new ArgumentException("stream is not writeable", nameof(output));

   _schema = schema ?? throw new ArgumentNullException(nameof(schema));
   _formatOptions = formatOptions ?? new ParquetOptions();
   _writerOptions = writerOptions ?? new WriterOptions();

   PrepareFile(append);
}
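// Usage sketch (not part of the original source): writes a single row group column-by-column with
// ParquetWriter3. DataField, Schema and DataColumn construction follows Parquet.Net's column-based
// API and may differ between library versions; CreateRowGroup takes the number of rows in the group,
// mirroring its use in Serialize<T> above.
private static void ParquetWriter3UsageSketch()
{
   var idField = new DataField<int>("id");
   var schema = new Schema(idField);

   using (Stream output = System.IO.File.Create("ids.parquet"))
   using (var writer = new ParquetWriter3(schema, output))
   using (ParquetRowGroupWriter groupWriter = writer.CreateRowGroup(3))
   {
      groupWriter.Write(new DataColumn(idField, new[] { 1, 2, 3 }));
   }
}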
/// <summary>
/// Writes <see cref="DataSet"/> to a target stream
/// </summary>
/// <param name="dataSet"><see cref="DataSet"/> to write</param>
/// <param name="destination">Destination stream</param>
/// <param name="compression">Compression method</param>
/// <param name="formatOptions">Parquet options, optional.</param>
/// <param name="writerOptions">Writer options, optional.</param>
/// <param name="append">When true, assumes the stream already contains a Parquet file and appends data to it; otherwise writes a new Parquet file.</param>
public static void Write(DataSet dataSet, Stream destination,
   CompressionMethod compression = CompressionMethod.Gzip,
   ParquetOptions formatOptions = null,
   WriterOptions writerOptions = null,
   bool append = false)
{
   using (var writer = new ParquetWriter(destination, formatOptions, writerOptions))
   {
      writer.Write(dataSet, compression, append);
   }
}
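// Usage sketch (not part of the original source): writes the DataSet into an in-memory stream, e.g.
// to hand the resulting Parquet bytes to another API. BuildDataSet() is the same hypothetical helper
// as in the WriteFile sketch above.
private static void WriteToStreamUsageSketch()
{
   DataSet dataSet = BuildDataSet(); //hypothetical helper, not defined here
   using (var destination = new MemoryStream())
   {
      ParquetWriter.Write(dataSet, destination, CompressionMethod.Gzip);
      byte[] parquetBytes = destination.ToArray(); //the complete Parquet file
   }
}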