/// <summary>
/// Writes the next data column to the parquet stream. Note that columns must be written
/// in the order they are declared in the file schema.
/// </summary>
/// <param name="column">Column to write</param>
public void WriteColumn(DataColumn column)
{
   if (column == null)
   {
      throw new ArgumentNullException(nameof(column));
   }

   if (RowCount == null)
   {
      if (column.Data.Length > 0 || column.Field.MaxRepetitionLevel == 0)
      {
         RowCount = column.CalculateRowCount();
      }
   }

   Thrift.SchemaElement tse = _thschema[_colIdx];
   if (!column.Field.Equals(tse))
   {
      throw new ArgumentException($"cannot write this column, expected '{tse.Name}', passed: '{column.Field.Name}'", nameof(column));
   }

   IDataTypeHandler dataTypeHandler = DataTypeFactory.Match(tse, _formatOptions);
   _colIdx += 1;

   List<string> path = _footer.GetPath(tse);

   var writer = new DataColumnWriter(_stream, _thriftStream, _footer, tse,
      _compressionMethod, _compressionLevel, (int)(RowCount ?? 0));

   Thrift.ColumnChunk chunk = writer.Write(path, column, dataTypeHandler);
   _thriftRowGroup.Columns.Add(chunk);
}
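// A minimal usage sketch for WriteColumn above. This assumes the Parquet.Net v3
// surface around it (ParquetWriter, CreateRowGroup, DataField, DataColumn); the
// file name is illustrative, not from the source.
using System.IO;
using Parquet;
using Parquet.Data;

var idField = new DataField<int>("id");
var nameField = new DataField<string>("name");
var schema = new Schema(idField, nameField);

using (Stream fs = File.Create("data.parquet"))
using (var writer = new ParquetWriter(schema, fs))
using (ParquetRowGroupWriter rg = writer.CreateRowGroup())
{
   // columns must be written in the order they appear in the schema
   rg.WriteColumn(new DataColumn(idField, new[] { 1, 2, 3 }));
   rg.WriteColumn(new DataColumn(nameField, new[] { "a", "b", "c" }));
}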
/// <summary>
/// Writes the dataset to the output stream.
/// </summary>
/// <param name="dataSet">Dataset to write</param>
/// <param name="compression">Compression method</param>
/// <param name="append">When true, appends to the file, otherwise creates a new file.</param>
public void Write(DataSet dataSet, CompressionMethod compression = CompressionMethod.Gzip, bool append = false)
{
   PrepareFile(dataSet, append);

   int offset = 0;
   int count;

   do
   {
      count = Math.Min(_writerOptions.RowGroupsSize, dataSet.Count - offset);
      Thrift.RowGroup rg = _meta.AddRowGroup();
      long rgStartPos = Stream.Position;
      rg.Columns = new List<Thrift.ColumnChunk>();

      foreach (SchemaElement se in dataSet.Schema.Flatten())
      {
         var cw = new ColumnWriter(Stream, ThriftStream, _meta, se, compression, _formatOptions, _writerOptions);
         IList values = dataSet.GetColumn(se, offset, count);
         Thrift.ColumnChunk chunk = cw.Write(offset, count, values);
         rg.Columns.Add(chunk);
      }

      //row group's size is a sum of _uncompressed_ sizes of all columns in it, including the headers;
      //luckily ColumnChunk already contains sizes of page+header in its meta
      rg.Total_byte_size = rg.Columns.Sum(c => c.Meta_data.Total_compressed_size);
      rg.Num_rows = count;

      offset += _writerOptions.RowGroupsSize;
   }
   while (offset < dataSet.Count);

   _dataWritten = true;
}
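// A usage sketch for the DataSet-based Write above, assuming the Parquet.Net
// v2-era API (DataSet, SchemaElement<T>, ParquetWriter); the file name is
// illustrative.
using System.IO;
using Parquet;
using Parquet.Data;

var ds = new DataSet(
   new SchemaElement<int>("id"),
   new SchemaElement<string>("city"));
ds.Add(1, "London");
ds.Add(2, "Derby");

using (Stream fs = File.Create("cities.parquet"))
using (var writer = new ParquetWriter(fs))
{
   // Gzip is the default; rows are split into row groups of RowGroupsSize
   writer.Write(ds, CompressionMethod.Gzip);
}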
private Thrift.ColumnChunk Write(SchemaElement schema, IList values, CompressionMethod compression, ColumnStats stats)
{
   Thrift.ColumnChunk chunk = _meta.AddColumnChunk(compression, _output, schema, values.Count);
   Thrift.PageHeader ph = _meta.CreateDataPage(values.Count);

   List<PageTag> pages = WriteValues(schema, values, ph, compression, stats);

   //the following counters must include both data size and header size
   chunk.Meta_data.Total_compressed_size = pages.Sum(p => p.HeaderMeta.Compressed_page_size + p.HeaderSize);
   chunk.Meta_data.Total_uncompressed_size = pages.Sum(p => p.HeaderMeta.Uncompressed_page_size + p.HeaderSize);

   return chunk;
}
private Thrift.ColumnChunk Write(SchemaElement schema, IList values, CompressionMethod compression, ColumnStats stats)
{
   Thrift.ColumnChunk chunk = _meta.AddColumnChunk(compression, _output, schema, values.Count);

   var ph = new Thrift.PageHeader(Thrift.PageType.DATA_PAGE, 0, 0);
   ph.Data_page_header = new Thrift.DataPageHeader
   {
      Encoding = Thrift.Encoding.PLAIN,
      Definition_level_encoding = Thrift.Encoding.RLE,
      Repetition_level_encoding = Thrift.Encoding.BIT_PACKED,
      Num_values = values.Count
   };

   WriteValues(schema, values, ph, compression, stats);

   return chunk;
}
/// <summary>
/// Writes the dataset to the output stream.
/// </summary>
/// <param name="dataSet">Dataset to write</param>
/// <param name="compression">Compression method</param>
/// <param name="append">When true, appends to the file, otherwise creates a new file.</param>
public void Write(DataSet dataSet, CompressionMethod compression = CompressionMethod.Gzip, bool append = false)
{
   PrepareFile(dataSet, append);
   _footer.CustomMetadata = dataSet.Metadata.Custom;

   int offset = 0;
   int count;
   List<Thrift.SchemaElement> writeableSchema = _footer.GetWriteableSchema().ToList();

   do
   {
      count = Math.Min(_writerOptions.RowGroupsSize, dataSet.Count - offset);
      Thrift.RowGroup rg = _footer.AddRowGroup();
      long rgStartPos = Stream.Position;
      rg.Columns = new List<Thrift.ColumnChunk>();

      foreach (Thrift.SchemaElement tse in writeableSchema)
      {
         List<string> path = _footer.GetPath(tse);
         string flatPath = string.Join(Schema.PathSeparator, path);
         var cw = new ColumnarWriter(Stream, ThriftStream, _footer, tse, path, compression, _formatOptions, _writerOptions);

         IList values = dataSet.GetColumn(flatPath, offset, count);
         Thrift.ColumnChunk chunk = cw.Write(offset, count, values);
         rg.Columns.Add(chunk);
      }

      //row group's size is a sum of _uncompressed_ sizes of all columns in it, including the headers;
      //luckily ColumnChunk already contains sizes of page+header in its meta
      rg.Total_byte_size = rg.Columns.Sum(c => c.Meta_data.Total_compressed_size);
      rg.Num_rows = count;

      offset += _writerOptions.RowGroupsSize;
   }
   while (offset < dataSet.Count);

   _dataWritten = true;
}
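// Sketch of attaching custom key/value metadata before writing; the Write
// overload above copies dataSet.Metadata.Custom into the file footer. This
// assumes Metadata.Custom is a string-to-string dictionary; the key and value
// are illustrative.
var ds = new DataSet(new SchemaElement<int>("id"));
ds.Add(1);
ds.Metadata.Custom["generated-by"] = "etl-job-42";

using (Stream fs = File.Create("meta.parquet"))
using (var writer = new ParquetWriter(fs))
{
   writer.Write(ds);
}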
/// <summary>
/// Reads the file into a <see cref="DataSet"/>, honoring the configured reader options.
/// </summary>
public DataSet Read()
{
   _readerOptions.Validate();

   _meta = ReadMetadata();

   var metaParser = new FileMetadataParser(_meta);
   Schema schema = metaParser.ParseSchema(_formatOptions);

   var pathToValues = new Dictionary<string, IList>();
   long pos = 0;
   long rowsRead = 0;

   foreach (Thrift.RowGroup rg in _meta.Row_groups)
   {
      //check whether to skip RG completely
      if ((_readerOptions.Count != -1 && rowsRead >= _readerOptions.Count) ||
          (_readerOptions.Offset > pos + rg.Num_rows - 1))
      {
         pos += rg.Num_rows;
         continue;
      }

      long offset = Math.Max(0, _readerOptions.Offset - pos);
      long count = _readerOptions.Count == -1
         ? rg.Num_rows
         : Math.Min(_readerOptions.Count - rowsRead, rg.Num_rows);

      for (int icol = 0; icol < rg.Columns.Count; icol++)
      {
         Thrift.ColumnChunk cc = rg.Columns[icol];
         SchemaElement se = schema[cc];

         var p = new ColumnReader(cc, se, _input, ThriftStream, _formatOptions);

         try
         {
            IList chunkValues = p.Read(offset, count);

            if (!pathToValues.TryGetValue(se.Path, out IList allValues))
            {
               pathToValues[se.Path] = chunkValues;
            }
            else
            {
               allValues.AddRange(chunkValues);
            }

            if (icol == 0)
            {
               //todo: this may not work
               rowsRead += chunkValues.Count;
            }
         }
         catch (Exception ex)
         {
            throw new ParquetException($"fatal error reading column '{se}'", ex);
         }
      }

      pos += rg.Num_rows;
   }

   return new DataSet(schema, pathToValues, _meta.Num_rows, _meta.Created_by);
}
/// <summary>
/// Reads the file into a <see cref="DataSet"/>, honoring the configured reader options
/// and field predicates.
/// </summary>
public DataSet Read()
{
   _readerOptions.Validate();

   _meta = ReadMetadata();
   var footer = new ThriftFooter(_meta);

   var pathToValues = new Dictionary<string, IList>();
   long pos = 0;
   long rowsRead = 0;

   foreach (Thrift.RowGroup rg in _meta.Row_groups)
   {
      //check whether to skip RG completely
      if ((_readerOptions.Count != -1 && rowsRead >= _readerOptions.Count) ||
          (_readerOptions.Offset > pos + rg.Num_rows - 1))
      {
         pos += rg.Num_rows;
         continue;
      }

      long offset = Math.Max(0, _readerOptions.Offset - pos);
      long count = _readerOptions.Count == -1
         ? rg.Num_rows
         : Math.Min(_readerOptions.Count - rowsRead, rg.Num_rows);

      for (int icol = 0; icol < rg.Columns.Count; icol++)
      {
         Thrift.ColumnChunk cc = rg.Columns[icol];
         string path = cc.GetPath();

         if (_fieldPredicates != null && !_fieldPredicates.Any(p => p.IsMatch(cc, path)))
         {
            continue;
         }

         var columnarReader = new ColumnarReader(_input, cc, footer, _formatOptions);

         try
         {
            IList chunkValues = columnarReader.Read(offset, count);

            if (!pathToValues.TryGetValue(path, out IList allValues))
            {
               pathToValues[path] = chunkValues;
            }
            else
            {
               foreach (object v in chunkValues)
               {
                  allValues.Add(v);
               }
            }

            if (icol == 0)
            {
               //todo: this may not work
               rowsRead += chunkValues.Count;
            }
         }
         catch (Exception ex)
         {
            throw new ParquetException($"fatal error reading column '{path}'", ex);
         }
      }

      pos += rg.Num_rows;
   }

   Schema schema = footer.CreateModelSchema(_formatOptions);
   schema = schema.Filter(_fieldPredicates);

   var ds = new DataSet(schema, pathToValues, _meta.Num_rows, _meta.Created_by);

   Dictionary<string, string> customMetadata = footer.CustomMetadata;
   if (customMetadata != null)
   {
      ds.Metadata.Custom.AddRange(customMetadata);
   }

   ds.Thrift = _meta;

   return ds;
}
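// Sketch of windowed reading, which Read() above honors via _readerOptions:
// Offset skips whole row groups where possible, and Count caps the rows
// returned. Assumes a v2-era static ParquetReader.Read overload taking
// ReaderOptions; the stream source is illustrative.
using (Stream fs = File.OpenRead("cities.parquet"))
{
   var options = new ReaderOptions { Offset = 10, Count = 100 };
   DataSet window = ParquetReader.Read(fs, null, options);   // at most rows 10..109
}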
public async Task ReadAsync(TProtocol iprot, CancellationToken cancellationToken)
{
   iprot.IncrementRecursionDepth();
   try
   {
      bool isset_columns = false;
      bool isset_total_byte_size = false;
      bool isset_num_rows = false;
      TField field;
      await iprot.ReadStructBeginAsync(cancellationToken);
      while (true)
      {
         field = await iprot.ReadFieldBeginAsync(cancellationToken);
         if (field.Type == TType.Stop)
         {
            break;
         }

         switch (field.ID)
         {
            case 1:
               if (field.Type == TType.List)
               {
                  {
                     Columns = new List<ColumnChunk>();
                     TList _list16 = await iprot.ReadListBeginAsync(cancellationToken);
                     for (int _i17 = 0; _i17 < _list16.Count; ++_i17)
                     {
                        ColumnChunk _elem18;
                        _elem18 = new ColumnChunk();
                        await _elem18.ReadAsync(iprot, cancellationToken);
                        Columns.Add(_elem18);
                     }
                     await iprot.ReadListEndAsync(cancellationToken);
                  }
                  isset_columns = true;
               }
               else
               {
                  await TProtocolUtil.SkipAsync(iprot, field.Type, cancellationToken);
               }
               break;
            case 2:
               if (field.Type == TType.I64)
               {
                  Total_byte_size = await iprot.ReadI64Async(cancellationToken);
                  isset_total_byte_size = true;
               }
               else
               {
                  await TProtocolUtil.SkipAsync(iprot, field.Type, cancellationToken);
               }
               break;
            case 3:
               if (field.Type == TType.I64)
               {
                  Num_rows = await iprot.ReadI64Async(cancellationToken);
                  isset_num_rows = true;
               }
               else
               {
                  await TProtocolUtil.SkipAsync(iprot, field.Type, cancellationToken);
               }
               break;
            case 4:
               if (field.Type == TType.List)
               {
                  {
                     Sorting_columns = new List<SortingColumn>();
                     TList _list19 = await iprot.ReadListBeginAsync(cancellationToken);
                     for (int _i20 = 0; _i20 < _list19.Count; ++_i20)
                     {
                        SortingColumn _elem21;
                        _elem21 = new SortingColumn();
                        await _elem21.ReadAsync(iprot, cancellationToken);
                        Sorting_columns.Add(_elem21);
                     }
                     await iprot.ReadListEndAsync(cancellationToken);
                  }
               }
               else
               {
                  await TProtocolUtil.SkipAsync(iprot, field.Type, cancellationToken);
               }
               break;
            default:
               await TProtocolUtil.SkipAsync(iprot, field.Type, cancellationToken);
               break;
         }

         await iprot.ReadFieldEndAsync(cancellationToken);
      }
      await iprot.ReadStructEndAsync(cancellationToken);
      if (!isset_columns)
      {
         throw new TProtocolException(TProtocolException.INVALID_DATA);
      }
      if (!isset_total_byte_size)
      {
         throw new TProtocolException(TProtocolException.INVALID_DATA);
      }
      if (!isset_num_rows)
      {
         throw new TProtocolException(TProtocolException.INVALID_DATA);
      }
   }
   finally
   {
      iprot.DecrementRecursionDepth();
   }
}
/// <summary>
/// Decodes raw bytes from <see cref="Thrift.Statistics"/> into a CLR value
/// </summary>
public static object DecodeSingleStatsValue(this Thrift.FileMetaData fileMeta, Thrift.ColumnChunk columnChunk, byte[] rawBytes)
{
   if (rawBytes == null || rawBytes.Length == 0)
   {
      return null;
   }

   var footer = new ThriftFooter(fileMeta);
   Thrift.SchemaElement schema = footer.GetSchemaElement(columnChunk);

   IDataTypeHandler handler = DataTypeFactory.Match(schema, new ParquetOptions { TreatByteArrayAsString = true });

   using (var ms = new MemoryStream(rawBytes))
   using (var reader = new BinaryReader(ms))
   {
      object value = handler.Read(reader, schema, rawBytes.Length);
      return value;
   }
}
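// Sketch: decoding a column chunk's minimum statistic with the extension above.
// Assumes the Thrift statistics expose the raw bytes as
// Meta_data.Statistics.Min_value; the row group and column indices are illustrative.
Thrift.ColumnChunk chunk = fileMeta.Row_groups[0].Columns[0];
byte[] rawMin = chunk.Meta_data.Statistics.Min_value;
object min = fileMeta.DecodeSingleStatsValue(chunk, rawMin);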
public static string GetPath(this Thrift.ColumnChunk columnChunk)
{
   return string.Join(Schema.PathSeparator, columnChunk.Meta_data.Path_in_schema);
}
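// Example: for a nested column whose Path_in_schema is ["address", "city"],
// GetPath returns "address.city" (assuming '.' is Schema.PathSeparator).
string flatPath = chunk.GetPath();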
public void Read(TProtocol iprot)
{
   iprot.IncrementRecursionDepth();
   try
   {
      bool isset_columns = false;
      bool isset_total_byte_size = false;
      bool isset_num_rows = false;
      TField field;
      iprot.ReadStructBegin();
      while (true)
      {
         field = iprot.ReadFieldBegin();
         if (field.Type == TType.Stop)
         {
            break;
         }

         switch (field.ID)
         {
            case 1:
               if (field.Type == TType.List)
               {
                  {
                     Columns = new List<ColumnChunk>();
                     TList _list20 = iprot.ReadListBegin();
                     for (int _i21 = 0; _i21 < _list20.Count; ++_i21)
                     {
                        ColumnChunk _elem22;
                        _elem22 = new ColumnChunk();
                        _elem22.Read(iprot);
                        Columns.Add(_elem22);
                     }
                     iprot.ReadListEnd();
                  }
                  isset_columns = true;
               }
               else
               {
                  TProtocolUtil.Skip(iprot, field.Type);
               }
               break;
            case 2:
               if (field.Type == TType.I64)
               {
                  Total_byte_size = iprot.ReadI64();
                  isset_total_byte_size = true;
               }
               else
               {
                  TProtocolUtil.Skip(iprot, field.Type);
               }
               break;
            case 3:
               if (field.Type == TType.I64)
               {
                  Num_rows = iprot.ReadI64();
                  isset_num_rows = true;
               }
               else
               {
                  TProtocolUtil.Skip(iprot, field.Type);
               }
               break;
            case 4:
               if (field.Type == TType.List)
               {
                  {
                     Sorting_columns = new List<SortingColumn>();
                     TList _list23 = iprot.ReadListBegin();
                     for (int _i24 = 0; _i24 < _list23.Count; ++_i24)
                     {
                        SortingColumn _elem25;
                        _elem25 = new SortingColumn();
                        _elem25.Read(iprot);
                        Sorting_columns.Add(_elem25);
                     }
                     iprot.ReadListEnd();
                  }
               }
               else
               {
                  TProtocolUtil.Skip(iprot, field.Type);
               }
               break;
            case 5:
               if (field.Type == TType.I64)
               {
                  File_offset = iprot.ReadI64();
               }
               else
               {
                  TProtocolUtil.Skip(iprot, field.Type);
               }
               break;
            case 6:
               if (field.Type == TType.I64)
               {
                  Total_compressed_size = iprot.ReadI64();
               }
               else
               {
                  TProtocolUtil.Skip(iprot, field.Type);
               }
               break;
            case 7:
               if (field.Type == TType.I16)
               {
                  Ordinal = iprot.ReadI16();
               }
               else
               {
                  TProtocolUtil.Skip(iprot, field.Type);
               }
               break;
            default:
               TProtocolUtil.Skip(iprot, field.Type);
               break;
         }

         iprot.ReadFieldEnd();
      }
      iprot.ReadStructEnd();
      if (!isset_columns)
      {
         throw new TProtocolException(TProtocolException.INVALID_DATA);
      }
      if (!isset_total_byte_size)
      {
         throw new TProtocolException(TProtocolException.INVALID_DATA);
      }
      if (!isset_num_rows)
      {
         throw new TProtocolException(TProtocolException.INVALID_DATA);
      }
   }
   finally
   {
      iprot.DecrementRecursionDepth();
   }
}