/// <summary>
/// Reads the file metadata and then every column chunk of every row group,
/// collecting the resulting columns into a new <see cref="DataSet"/>.
/// </summary>
/// <returns>A data set whose schema is built from the file metadata and which holds all columns read.</returns>
/// <exception cref="ParquetException">Wraps any exception raised while reading a single column; the message names the column.</exception>
public DataSet Read()
{
    _meta = ReadMetadata();

    var result = new DataSet(new Schema(_meta));
    var columns = new List<IList>();

    foreach (Thrift.RowGroup rowGroup in _meta.Row_groups)
    {
        foreach (Thrift.ColumnChunk chunk in rowGroup.Columns)
        {
            var columnReader = new PColumn(chunk, result.Schema, _input, _thrift, _options);

            // dotted path is used both as the read key and in the error message
            string columnName = string.Join(".", chunk.Meta_data.Path_in_schema);

            try
            {
                IList values = columnReader.Read(columnName);
                columns.Add(values);
            }
            catch (Exception ex)
            {
                throw new ParquetException($"fatal error reading column '{columnName}'", ex);
            }
        }
    }

    result.AddColumnar(columns);
    return result;
}
/// <summary>
/// Prepares the destination for writing, either by appending to an existing file
/// or by starting/continuing a fresh one.
/// </summary>
/// <param name="append">
/// When true, the existing file is validated, its metadata is loaded into the footer,
/// schema compatibility is checked and <c>GoBeforeFooter</c> is called.
/// When false, a new footer is created (and the magic written) on first use,
/// otherwise the existing footer is checked against the schema and extended.
/// </param>
/// <exception cref="IOException">Appending was requested but the stream is not seekable.</exception>
private void PrepareFile(bool append)
{
    if (!append)
    {
        if (_footer != null)
        {
            // footer already exists: only verify the schema and register another batch
            ValidateSchemasCompatible(_footer, _schema);
            _footer.Add(0 /* todo: don't forget to set the total row count at the end!!! */);
            return;
        }

        _footer = new ThriftFooter(_schema, 0 /* todo: don't forget to set the total row count at the end!!! */);

        //file starts with magic
        WriteMagic();
        return;
    }

    if (!Stream.CanSeek)
    {
        throw new IOException("destination stream must be seekable for append operations.");
    }

    ValidateFile();

    Thrift.FileMetaData existingMeta = ReadMetadata();
    _footer = new ThriftFooter(existingMeta);

    ValidateSchemasCompatible(_footer, _schema);

    GoBeforeFooter();
}
/// <summary>
/// Prepares the destination for writing the given data set, either by appending
/// to an existing file or by starting/continuing a fresh one.
/// </summary>
/// <param name="ds">Data set about to be written; supplies the schema and row count for the footer.</param>
/// <param name="append">
/// When true, the existing file is validated, its metadata is loaded into the footer,
/// compatibility with <paramref name="ds"/> is checked and <c>GoBeforeFooter</c> is called.
/// When false, a new footer is created (and the magic written) on first use,
/// otherwise the existing footer is checked and the row count of <paramref name="ds"/> is added to it.
/// </param>
/// <exception cref="IOException">Appending was requested but the stream is not seekable.</exception>
void PrepareFile(DataSet ds, bool append)
{
    if (!append)
    {
        if (_footer != null)
        {
            // footer already exists: verify the schema and account for the new rows
            ValidateSchemasCompatible(_footer, ds);
            _footer.Add(ds.RowCount);
            return;
        }

        _footer = new ThriftFooter(ds.Schema, ds.RowCount);

        //file starts with magic
        WriteMagic();
        return;
    }

    if (!Stream.CanSeek)
    {
        throw new IOException("destination stream must be seekable for append operations.");
    }

    ValidateFile();

    Thrift.FileMetaData existingMeta = ReadMetadata();
    _footer = new ThriftFooter(existingMeta);

    ValidateSchemasCompatible(_footer, ds);

    GoBeforeFooter();
}
/// <summary>
/// Prepares the output for writing the given data set, either by appending to an
/// existing file or by starting/continuing a fresh one.
/// </summary>
/// <param name="ds">Data set about to be written; its schema must equal the schema already in use.</param>
/// <param name="append">
/// When true, the existing file is validated, its metadata is loaded, the schema parsed from it
/// is compared with the schema of <paramref name="ds"/> and <c>GoBeforeFooter</c> is called.
/// When false, the first call remembers the schema, writes the magic and registers the schema
/// in the metadata; later calls only verify that the schema has not changed.
/// </param>
/// <exception cref="IOException">Appending was requested but the output stream is not seekable.</exception>
/// <exception cref="ParquetException">The schema of <paramref name="ds"/> differs from the existing schema.</exception>
private void PrepareFile(DataSet ds, bool append)
{
    if (append)
    {
        if (!_output.CanSeek)
        {
            throw new IOException("destination stream must be seekable for append operations.");
        }

        ValidateFile();

        Thrift.FileMetaData fileMeta = ReadMetadata();
        _meta.SetMeta(fileMeta);

        Schema existingSchema = new FileMetadataParser(fileMeta).ParseSchema(_formatOptions);
        if (!ds.Schema.Equals(existingSchema))
        {
            throw new ParquetException($"{nameof(DataSet)} schema does not match existing file schema");
        }

        GoBeforeFooter();
        return;
    }

    if (_existingSchema == null)
    {
        _existingSchema = ds.Schema;

        //file starts with magic
        WriteMagic();

        _meta.AddSchema(ds);
        return;
    }

    if (!_existingSchema.Equals(ds.Schema))
    {
        // message typo fixed: "expeted" -> "expected"
        throw new ParquetException($"expected schema {_existingSchema} but found {ds.Schema}.");
    }
}
/// <summary>
/// Creates a reader over an input stream, validating the stream and loading the file metadata immediately.
/// </summary>
/// <param name="input">Input stream, must be readable and seekable</param>
/// <param name="parquetOptions">Optional reader options; a default <see cref="ParquetOptions"/> instance is used when null.</param>
/// <param name="leaveStreamOpen">When true, leaves the stream passed in <paramref name="input"/> open after disposing the reader.</param>
/// <exception cref="ArgumentNullException">input (not checked here; presumably raised by the chained constructor)</exception>
/// <exception cref="ArgumentException">stream must be readable and seekable - input</exception>
/// <exception cref="IOException">not a Parquet file (size too small)</exception>
public ParquetReader(Stream input, ParquetOptions parquetOptions = null, bool leaveStreamOpen = true)
    : this(input, leaveStreamOpen)
{
    if (!(input.CanRead && input.CanSeek))
    {
        throw new ArgumentException("stream must be readable and seekable", nameof(input));
    }

    // streams of 8 bytes or fewer are rejected before any further validation
    if (_input.Length <= 8)
    {
        throw new IOException("not a Parquet file (size too small)");
    }

    ValidateFile();

    if (parquetOptions == null)
    {
        _parquetOptions = new ParquetOptions();
    }
    else
    {
        _parquetOptions = parquetOptions;
    }

    //read metadata instantly, now
    _meta = ReadMetadata();
    _footer = new ThriftFooter(_meta);

    ParquetEventSource.Current.OpenStream(
        input.Length,
        leaveStreamOpen,
        _meta.Row_groups.Count,
        _meta.Num_rows);

    InitRowGroupReaders();
}
/// <summary>
/// Decodes raw bytes from <see cref="Thrift.Statistics"/> into a CLR value
/// </summary>
/// <param name="fileMeta">File metadata used to build the footer from which the column's schema element is resolved.</param>
/// <param name="columnChunk">Column chunk the raw bytes belong to.</param>
/// <param name="rawBytes">Encoded value; may be null or empty.</param>
/// <returns>
/// Null when <paramref name="rawBytes"/> is null or empty, otherwise the value produced by the
/// data type handler matched for the column's schema element.
/// </returns>
public static object DecodeSingleStatsValue(this Thrift.FileMetaData fileMeta, Thrift.ColumnChunk columnChunk, byte[] rawBytes)
{
    bool nothingToDecode = rawBytes == null || rawBytes.Length == 0;
    if (nothingToDecode)
    {
        return null;
    }

    var thriftFooter = new ThriftFooter(fileMeta);
    Thrift.SchemaElement element = thriftFooter.GetSchemaElement(columnChunk);

    // handler is matched with byte arrays treated as strings
    var matchOptions = new ParquetOptions { TreatByteArrayAsString = true };
    IDataTypeHandler typeHandler = DataTypeFactory.Match(element, matchOptions);

    using (var buffer = new MemoryStream(rawBytes))
    {
        using (var binaryReader = new BinaryReader(buffer))
        {
            return typeHandler.Read(binaryReader, element, rawBytes.Length);
        }
    }
}
/// <summary>
/// Creates a reader over an input stream, validating the stream and loading the file metadata immediately.
/// </summary>
/// <param name="input">Input stream, must be readable and seekable</param>
/// <param name="parquetOptions">Optional format options; a default <see cref="ParquetOptions"/> instance is used when null.</param>
/// <param name="readerOptions">Optional reader options; a default <see cref="ReaderOptions"/> instance is used when null.</param>
/// <exception cref="ArgumentNullException">input</exception>
/// <exception cref="ArgumentException">stream must be readable and seekable - input</exception>
/// <exception cref="IOException">not a Parquet file (size too small)</exception>
public ParquetReader3(Stream input, ParquetOptions parquetOptions = null, ReaderOptions readerOptions = null)
    : base(input)
{
    if (input == null)
    {
        throw new ArgumentNullException(nameof(input));
    }
    _input = input;

    if (!(input.CanRead && input.CanSeek))
    {
        throw new ArgumentException("stream must be readable and seekable", nameof(input));
    }

    // streams of 8 bytes or fewer are rejected before any further validation
    if (_input.Length <= 8)
    {
        throw new IOException("not a Parquet file (size too small)");
    }

    ValidateFile();

    if (parquetOptions == null)
    {
        _parquetOptions = new ParquetOptions();
    }
    else
    {
        _parquetOptions = parquetOptions;
    }

    if (readerOptions == null)
    {
        _readerOptions = new ReaderOptions();
    }
    else
    {
        _readerOptions = readerOptions;
    }

    //read metadata instantly, now
    _meta = ReadMetadata();
    _footer = new ThriftFooter(_meta);

    InitRowGroupReaders();
}
/// <summary>
/// Reads the file into a <see cref="DataSet"/>, restricted to the row window described by the
/// reader options (<c>Offset</c> and <c>Count</c>, where a <c>Count</c> of -1 means no limit).
/// </summary>
/// <returns>
/// A data set built from the parsed schema and the values gathered per column path,
/// carrying the file's total row count and creator string from the metadata.
/// </returns>
/// <exception cref="ParquetException">Wraps any exception raised while reading or merging a column chunk.</exception>
public DataSet Read()
{
    _readerOptions.Validate();

    _meta = ReadMetadata();

    var parser = new FileMetadataParser(_meta);
    Schema schema = parser.ParseSchema(_formatOptions);

    var valuesByPath = new Dictionary<string, IList>();

    long groupStart = 0;   // index of the first row of the current row group within the file
    long rowsRead = 0;

    foreach (Thrift.RowGroup rowGroup in _meta.Row_groups)
    {
        //check whether to skip RG completely
        bool limitReached = _readerOptions.Count != -1 && rowsRead >= _readerOptions.Count;
        bool skipGroup = limitReached || _readerOptions.Offset > groupStart + rowGroup.Num_rows - 1;
        if (skipGroup)
        {
            groupStart += rowGroup.Num_rows;
            continue;
        }

        long offset = Math.Max(0, _readerOptions.Offset - groupStart);

        long count;
        if (_readerOptions.Count == -1)
        {
            count = rowGroup.Num_rows;
        }
        else
        {
            count = Math.Min(_readerOptions.Count - rowsRead, rowGroup.Num_rows);
        }

        for (int columnIndex = 0; columnIndex < rowGroup.Columns.Count; columnIndex++)
        {
            Thrift.ColumnChunk chunk = rowGroup.Columns[columnIndex];
            SchemaElement element = schema[chunk];
            var reader = new ColumnReader(chunk, element, _input, ThriftStream, _formatOptions);

            try
            {
                IList chunkValues = reader.Read(offset, count);

                if (valuesByPath.TryGetValue(element.Path, out IList accumulated))
                {
                    accumulated.AddRange(chunkValues);
                }
                else
                {
                    valuesByPath[element.Path] = chunkValues;
                }

                if (columnIndex == 0)
                {
                    //todo: this may not work
                    rowsRead += chunkValues.Count;
                }
            }
            catch (Exception ex)
            {
                throw new ParquetException($"fatal error reading column '{element}'", ex);
            }
        }

        groupStart += rowGroup.Num_rows;
    }

    return new DataSet(schema, valuesByPath, _meta.Num_rows, _meta.Created_by);
}
/// <summary>
/// Reads the file into a <see cref="DataSet"/>, restricted to the row window described by the
/// reader options (<c>Offset</c> and <c>Count</c>, where a <c>Count</c> of -1 means no limit)
/// and to the columns accepted by the field predicates, when any are set.
/// </summary>
/// <returns>
/// A data set built from the (filtered) model schema and the values gathered per column path,
/// with custom metadata copied from the footer when present and the raw Thrift metadata attached.
/// </returns>
/// <exception cref="ParquetException">Wraps any exception raised while reading or merging a column chunk.</exception>
public DataSet Read()
{
    _readerOptions.Validate();

    _meta = ReadMetadata();

    var footer = new ThriftFooter(_meta);

    var pathToValues = new Dictionary<string, IList>();
    long pos = 0;        // index of the first row of the current row group within the file
    long rowsRead = 0;

    foreach (Thrift.RowGroup rg in _meta.Row_groups)
    {
        //check whether to skip RG completely
        if ((_readerOptions.Count != -1 && rowsRead >= _readerOptions.Count) ||
            (_readerOptions.Offset > pos + rg.Num_rows - 1))
        {
            pos += rg.Num_rows;
            continue;
        }

        long offset = Math.Max(0, _readerOptions.Offset - pos);
        long count = _readerOptions.Count == -1
            ? rg.Num_rows
            : Math.Min(_readerOptions.Count - rowsRead, rg.Num_rows);

        // Rows are counted from the first column actually read in this row group.
        // Keying on column index 0 would never advance rowsRead when that column is
        // excluded by the field predicates, which would defeat the Count limit.
        bool rowsCounted = false;

        foreach (Thrift.ColumnChunk cc in rg.Columns)
        {
            string path = cc.GetPath();

            if (_fieldPredicates != null && !_fieldPredicates.Any(p => p.IsMatch(cc, path)))
            {
                continue;
            }

            var columnarReader = new ColumnarReader(_input, cc, footer, _formatOptions);

            try
            {
                IList chunkValues = columnarReader.Read(offset, count);

                if (!pathToValues.TryGetValue(path, out IList allValues))
                {
                    pathToValues[path] = chunkValues;
                }
                else
                {
                    foreach (object v in chunkValues)
                    {
                        allValues.Add(v);
                    }
                }

                if (!rowsCounted)
                {
                    //todo: this may not work
                    rowsRead += chunkValues.Count;
                    rowsCounted = true;
                }
            }
            catch (Exception ex)
            {
                throw new ParquetException($"fatal error reading column '{path}'", ex);
            }
        }

        pos += rg.Num_rows;
    }

    Schema schema = footer.CreateModelSchema(_formatOptions);
    schema = schema.Filter(_fieldPredicates);

    var ds = new DataSet(schema, pathToValues, _meta.Num_rows, _meta.Created_by);

    Dictionary<string, string> customMetadata = footer.CustomMetadata;
    if (customMetadata != null)
    {
        ds.Metadata.Custom.AddRange(customMetadata);
    }

    ds.Thrift = _meta;

    return ds;
}