Exemple #1
0
        /// <summary>
        /// Reads the file metadata and then every column chunk of every row group,
        /// returning the collected columns as a <see cref="DataSet"/>.
        /// </summary>
        /// <returns>
        /// A data set whose schema is built from the file metadata and which is
        /// populated through a single <c>AddColumnar</c> call.
        /// </returns>
        /// <exception cref="ParquetException">
        /// Reading a column failed; the original exception is attached as the inner exception.
        /// </exception>
        public DataSet Read()
        {
            _meta = ReadMetadata();

            DataSet result = new DataSet(new Schema(_meta));
            List<IList> columns = new List<IList>();

            foreach (Thrift.RowGroup rowGroup in _meta.Row_groups)
            {
                foreach (Thrift.ColumnChunk chunk in rowGroup.Columns)
                {
                    // The reader and the path are built outside the try block,
                    // so failures here are not wrapped in ParquetException.
                    PColumn columnReader = new PColumn(chunk, result.Schema, _input, _thrift, _options);
                    string dottedPath = string.Join(".", chunk.Meta_data.Path_in_schema);

                    try
                    {
                        columns.Add(columnReader.Read(dottedPath));
                    }
                    catch (Exception ex)
                    {
                        throw new ParquetException($"fatal error reading column '{dottedPath}'", ex);
                    }
                }
            }

            result.AddColumnar(columns);

            return result;
        }
        /// <summary>
        /// Prepares the destination before data is written: either positions an
        /// existing file for appending, or starts a new footer / extends the current one.
        /// </summary>
        /// <param name="append">
        /// When true, the existing file is validated, its metadata is read into a new
        /// footer and checked against <c>_schema</c>. When false, a footer is created
        /// (and the magic written) on first use, or the existing footer is extended.
        /// </param>
        /// <exception cref="IOException">Append was requested but the stream is not seekable.</exception>
        private void PrepareFile(bool append)
        {
            if (!append)
            {
                if (_footer == null)
                {
                    _footer = new ThriftFooter(_schema, 0 /* todo: don't forget to set the total row count at the end!!! */);

                    //file starts with magic
                    WriteMagic();
                    return;
                }

                ValidateSchemasCompatible(_footer, _schema);

                _footer.Add(0 /* todo: don't forget to set the total row count at the end!!! */);
                return;
            }

            // Append path.
            if (!Stream.CanSeek)
            {
                throw new IOException("destination stream must be seekable for append operations.");
            }

            ValidateFile();

            Thrift.FileMetaData existingMeta = ReadMetadata();
            _footer = new ThriftFooter(existingMeta);

            ValidateSchemasCompatible(_footer, _schema);

            GoBeforeFooter();
        }
Exemple #3
0
        /// <summary>
        /// Prepares the destination before <paramref name="ds"/> is written: either
        /// positions an existing file for appending, or starts a new footer / extends
        /// the current one with the data set's row count.
        /// </summary>
        /// <param name="ds">Data set about to be written; supplies the schema and row count.</param>
        /// <param name="append">
        /// When true, the existing file is validated, its metadata is read into a new
        /// footer and checked against <paramref name="ds"/>. When false, a footer is
        /// created (and the magic written) on first use, or the existing footer is extended.
        /// </param>
        /// <exception cref="IOException">Append was requested but the stream is not seekable.</exception>
        void PrepareFile(DataSet ds, bool append)
        {
            if (!append)
            {
                if (_footer == null)
                {
                    _footer = new ThriftFooter(ds.Schema, ds.RowCount);

                    //file starts with magic
                    WriteMagic();
                    return;
                }

                ValidateSchemasCompatible(_footer, ds);

                _footer.Add(ds.RowCount);
                return;
            }

            // Append path.
            if (!Stream.CanSeek)
            {
                throw new IOException("destination stream must be seekable for append operations.");
            }

            ValidateFile();

            Thrift.FileMetaData existingMeta = ReadMetadata();
            _footer = new ThriftFooter(existingMeta);

            ValidateSchemasCompatible(_footer, ds);

            GoBeforeFooter();
        }
        /// <summary>
        /// Prepares the destination before <paramref name="ds"/> is written and makes
        /// sure its schema matches whatever has been written (or already exists) so far.
        /// </summary>
        /// <param name="ds">Data set about to be written.</param>
        /// <param name="append">
        /// When true, the existing file is validated, its metadata is loaded into
        /// <c>_meta</c> and its parsed schema must equal the data set's schema.
        /// When false, the first call records the schema and writes the magic;
        /// later calls only verify the schema is unchanged.
        /// </param>
        /// <exception cref="IOException">Append was requested but the output stream is not seekable.</exception>
        /// <exception cref="ParquetException">The data set schema differs from the existing/previous schema.</exception>
        private void PrepareFile(DataSet ds, bool append)
        {
            if (append)
            {
                if (!_output.CanSeek)
                {
                    throw new IOException("destination stream must be seekable for append operations.");
                }

                ValidateFile();

                Thrift.FileMetaData fileMeta = ReadMetadata();
                _meta.SetMeta(fileMeta);

                Schema existingSchema = new FileMetadataParser(fileMeta).ParseSchema(_formatOptions);

                if (!ds.Schema.Equals(existingSchema))
                {
                    throw new ParquetException($"{nameof(DataSet)} schema does not match existing file schema");
                }

                GoBeforeFooter();
                return;
            }

            if (_existingSchema == null)
            {
                // First write: remember the schema for subsequent calls.
                _existingSchema = ds.Schema;

                //file starts with magic
                WriteMagic();

                _meta.AddSchema(ds);
                return;
            }

            if (!_existingSchema.Equals(ds.Schema))
            {
                // Message typo fixed ("expeted" -> "expected").
                throw new ParquetException($"expected schema {_existingSchema} but found {ds.Schema}.");
            }
        }
Exemple #5
0
        /// <summary>
        /// Creates a reader over an input stream, validates the file and eagerly
        /// loads its metadata and row group readers.
        /// </summary>
        /// <param name="input">Input stream, must be readable and seekable</param>
        /// <param name="parquetOptions">Optional reader options; a default <see cref="ParquetOptions"/> is used when null.</param>
        /// <param name="leaveStreamOpen">When true, leaves the stream passed in <paramref name="input"/> open after disposing the reader.</param>
        /// <exception cref="ArgumentNullException">
        /// input (not checked in this body; presumably enforced by the chained constructor - verify)
        /// </exception>
        /// <exception cref="ArgumentException">stream must be readable and seekable - input</exception>
        /// <exception cref="IOException">not a Parquet file (size too small)</exception>
        public ParquetReader(Stream input, ParquetOptions parquetOptions = null, bool leaveStreamOpen = true) : this(input, leaveStreamOpen)
        {
            bool streamUsable = input.CanRead && input.CanSeek;
            if (!streamUsable)
            {
                throw new ArgumentException("stream must be readable and seekable", nameof(input));
            }

            if (_input.Length <= 8)
            {
                throw new IOException("not a Parquet file (size too small)");
            }

            ValidateFile();
            _parquetOptions = parquetOptions ?? new ParquetOptions();

            // Metadata is loaded eagerly, as part of construction.
            _meta = ReadMetadata();
            _footer = new ThriftFooter(_meta);

            ParquetEventSource.Current.OpenStream(
                input.Length,
                leaveStreamOpen,
                _meta.Row_groups.Count,
                _meta.Num_rows);

            InitRowGroupReaders();
        }
        /// <summary>
        /// Decodes raw bytes from <see cref="Thrift.Statistics"/> into a CLR value
        /// </summary>
        /// <param name="fileMeta">File metadata used to build the footer that resolves the schema element.</param>
        /// <param name="columnChunk">Column chunk whose schema element drives the decoding.</param>
        /// <param name="rawBytes">Encoded value; may be null or empty.</param>
        /// <returns>
        /// The value produced by the matching data type handler, or null when
        /// <paramref name="rawBytes"/> is null or empty.
        /// </returns>
        public static object DecodeSingleStatsValue(this Thrift.FileMetaData fileMeta, Thrift.ColumnChunk columnChunk, byte[] rawBytes)
        {
            bool hasBytes = rawBytes != null && rawBytes.Length != 0;
            if (!hasBytes)
            {
                return null;
            }

            Thrift.SchemaElement element = new ThriftFooter(fileMeta).GetSchemaElement(columnChunk);

            // Byte arrays are treated as strings when matching the handler.
            var decodeOptions = new ParquetOptions { TreatByteArrayAsString = true };
            IDataTypeHandler typeHandler = DataTypeFactory.Match(element, decodeOptions);

            using (var buffer = new MemoryStream(rawBytes))
            {
                using (var binary = new BinaryReader(buffer))
                {
                    return typeHandler.Read(binary, element, rawBytes.Length);
                }
            }
        }
Exemple #7
0
        /// <summary>
        /// Creates a reader over an input stream, validates the file and eagerly
        /// loads its metadata and row group readers.
        /// </summary>
        /// <param name="input">Input stream, must be readable and seekable</param>
        /// <param name="parquetOptions">Optional format options; a default <see cref="ParquetOptions"/> is used when null.</param>
        /// <param name="readerOptions">Optional reader options; a default <see cref="ReaderOptions"/> is used when null.</param>
        /// <exception cref="ArgumentNullException">input</exception>
        /// <exception cref="ArgumentException">stream must be readable and seekable - input</exception>
        /// <exception cref="IOException">not a Parquet file (size too small)</exception>
        public ParquetReader3(Stream input, ParquetOptions parquetOptions = null, ReaderOptions readerOptions = null) : base(input)
        {
            _input = input ?? throw new ArgumentNullException(nameof(input));

            bool streamUsable = input.CanRead && input.CanSeek;
            if (!streamUsable)
            {
                throw new ArgumentException("stream must be readable and seekable", nameof(input));
            }

            if (_input.Length <= 8)
            {
                throw new IOException("not a Parquet file (size too small)");
            }

            ValidateFile();
            _parquetOptions = parquetOptions ?? new ParquetOptions();
            _readerOptions = readerOptions ?? new ReaderOptions();

            // Metadata is loaded eagerly, as part of construction.
            _meta = ReadMetadata();
            _footer = new ThriftFooter(_meta);

            InitRowGroupReaders();
        }
        /// <summary>
        /// Reads the file into a <see cref="DataSet"/>, honouring the offset and count
        /// configured in the reader options.
        /// </summary>
        /// <returns>
        /// A data set built from the parsed schema, the values gathered per column path,
        /// and the row count / creator recorded in the file metadata.
        /// </returns>
        /// <exception cref="ParquetException">
        /// Reading a column chunk failed; the original exception is attached as the inner exception.
        /// </exception>
        public DataSet Read()
        {
            _readerOptions.Validate();

            _meta = ReadMetadata();

            Schema schema = new FileMetadataParser(_meta).ParseSchema(_formatOptions);

            var pathToValues = new Dictionary<string, IList>();
            long pos = 0;       // number of rows in the row groups already passed
            long rowsRead = 0;  // rows actually read so far

            foreach (Thrift.RowGroup rowGroup in _meta.Row_groups)
            {
                // A row group is skipped entirely when the requested count is already
                // satisfied, or when the requested offset lies beyond its last row.
                bool limitReached = _readerOptions.Count != -1 && rowsRead >= _readerOptions.Count;
                bool skipGroup = limitReached || (_readerOptions.Offset > pos + rowGroup.Num_rows - 1);

                if (skipGroup)
                {
                    pos += rowGroup.Num_rows;
                    continue;
                }

                long offset = Math.Max(0, _readerOptions.Offset - pos);

                long count;
                if (_readerOptions.Count == -1)
                {
                    count = rowGroup.Num_rows;
                }
                else
                {
                    count = Math.Min(_readerOptions.Count - rowsRead, rowGroup.Num_rows);
                }

                for (int icol = 0; icol < rowGroup.Columns.Count; icol++)
                {
                    Thrift.ColumnChunk chunk = rowGroup.Columns[icol];
                    SchemaElement element = schema[chunk];

                    var chunkReader = new ColumnReader(chunk, element, _input, ThriftStream, _formatOptions);

                    try
                    {
                        IList chunkValues = chunkReader.Read(offset, count);

                        if (pathToValues.TryGetValue(element.Path, out IList existingValues))
                        {
                            existingValues.AddRange(chunkValues);
                        }
                        else
                        {
                            pathToValues[element.Path] = chunkValues;
                        }

                        if (icol == 0)
                        {
                            //todo: this may not work
                            rowsRead += chunkValues.Count;
                        }
                    }
                    catch (Exception ex)
                    {
                        throw new ParquetException($"fatal error reading column '{element}'", ex);
                    }
                }

                pos += rowGroup.Num_rows;
            }

            return new DataSet(schema, pathToValues, _meta.Num_rows, _meta.Created_by);
        }
Exemple #9
0
        /// <summary>
        /// Reads the file into a <see cref="DataSet"/>, honouring the offset and count
        /// configured in the reader options and, when field predicates are set, reading
        /// only the column chunks that match at least one of them.
        /// </summary>
        /// <returns>
        /// A data set built from the (filtered) model schema and the values gathered per
        /// column path, with the footer's custom metadata copied in when present and the
        /// raw file metadata attached.
        /// </returns>
        /// <exception cref="ParquetException">
        /// Reading a column chunk failed; the original exception is attached as the inner exception.
        /// </exception>
        public DataSet Read()
        {
            _readerOptions.Validate();

            _meta = ReadMetadata();

            var footer = new ThriftFooter(_meta);

            var  pathToValues = new Dictionary<string, IList>();
            long pos          = 0;
            long rowsRead     = 0;

            foreach (Thrift.RowGroup rg in _meta.Row_groups)
            {
                //check whether to skip RG completely
                if ((_readerOptions.Count != -1 && rowsRead >= _readerOptions.Count) ||
                    (_readerOptions.Offset > pos + rg.Num_rows - 1))
                {
                    pos += rg.Num_rows;
                    continue;
                }

                long offset = Math.Max(0, _readerOptions.Offset - pos);
                long count  = _readerOptions.Count == -1 ? rg.Num_rows : Math.Min(_readerOptions.Count - rowsRead, rg.Num_rows);

                // Rows are counted once per row group, on the first column that is
                // actually read. Keying this on column index 0 would leave rowsRead
                // untouched whenever the field predicates filter that column out,
                // and the Count limit would then never be applied.
                bool rowsCounted = false;

                for (int icol = 0; icol < rg.Columns.Count; icol++)
                {
                    Thrift.ColumnChunk cc   = rg.Columns[icol];
                    string             path = cc.GetPath();
                    if (_fieldPredicates != null && !_fieldPredicates.Any(p => p.IsMatch(cc, path)))
                    {
                        continue;
                    }

                    var columnarReader = new ColumnarReader(_input, cc, footer, _formatOptions);

                    try
                    {
                        IList chunkValues = columnarReader.Read(offset, count);

                        if (!pathToValues.TryGetValue(path, out IList allValues))
                        {
                            pathToValues[path] = chunkValues;
                        }
                        else
                        {
                            foreach (object v in chunkValues)
                            {
                                allValues.Add(v);
                            }
                        }

                        if (!rowsCounted)
                        {
                            //todo: this may not work - it assumes the chunk's value count equals its row count
                            rowsRead   += chunkValues.Count;
                            rowsCounted = true;
                        }
                    }
                    catch (Exception ex)
                    {
                        throw new ParquetException($"fatal error reading column '{path}'", ex);
                    }
                }

                pos += rg.Num_rows;
            }

            Schema schema = footer.CreateModelSchema(_formatOptions);

            schema = schema.Filter(_fieldPredicates);
            var ds = new DataSet(schema, pathToValues, _meta.Num_rows, _meta.Created_by);
            Dictionary<string, string> customMetadata = footer.CustomMetadata;

            if (customMetadata != null)
            {
                ds.Metadata.Custom.AddRange(customMetadata);
            }
            ds.Thrift = _meta;
            return ds;
        }