コード例 #1
0
ファイル: ParquetWriter.cs プロジェクト: 0x0L/parquet-dotnet
        /// <summary>
        /// Gets the writer's footer and destination ready before the rows of <paramref name="ds"/> are written.
        /// </summary>
        /// <param name="ds">Data set whose schema and row count are used to create or extend the footer.</param>
        /// <param name="append">
        /// When true, the footer is loaded from the existing destination and checked against <paramref name="ds"/>;
        /// when false, a footer is created on first use or extended on later calls.
        /// </param>
        /// <exception cref="IOException">Append was requested but the destination stream is not seekable.</exception>
        void PrepareFile(DataSet ds, bool append)
        {
            if (!append)
            {
                if (_footer != null)
                {
                    // a footer already exists on this writer: check the schema, then register the extra rows
                    ValidateSchemasCompatible(_footer, ds);

                    _footer.Add(ds.RowCount);
                    return;
                }

                _footer = new ThriftFooter(ds.Schema, ds.RowCount);

                // file starts with magic
                WriteMagic();
                return;
            }

            if (!Stream.CanSeek)
            {
                throw new IOException("destination stream must be seekable for append operations.");
            }

            ValidateFile();

            // rebuild the footer from the metadata already present in the destination
            Thrift.FileMetaData existingMeta = ReadMetadata();
            _footer = new ThriftFooter(existingMeta);

            ValidateSchemasCompatible(_footer, ds);

            GoBeforeFooter();
        }
コード例 #2
0
        /// <summary>
        /// Gets the writer's footer and destination ready for writing, using the writer's own schema.
        /// </summary>
        /// <param name="append">
        /// When true, the footer is loaded from the existing destination and checked against the writer's schema;
        /// when false, a footer is created on first use or extended on later calls.
        /// </param>
        /// <exception cref="IOException">Append was requested but the destination stream is not seekable.</exception>
        private void PrepareFile(bool append)
        {
            if (!append)
            {
                if (_footer != null)
                {
                    ValidateSchemasCompatible(_footer, _schema);

                    // todo: don't forget to set the total row count at the end!!!
                    _footer.Add(0);
                    return;
                }

                // todo: don't forget to set the total row count at the end!!!
                _footer = new ThriftFooter(_schema, 0);

                // file starts with magic
                WriteMagic();
                return;
            }

            if (!Stream.CanSeek)
            {
                throw new IOException("destination stream must be seekable for append operations.");
            }

            ValidateFile();

            // rebuild the footer from the metadata already present in the destination
            Thrift.FileMetaData existingMeta = ReadMetadata();
            _footer = new ThriftFooter(existingMeta);

            ValidateSchemasCompatible(_footer, _schema);

            GoBeforeFooter();
        }
コード例 #3
0
        /// <summary>
        /// Checks that <paramref name="schema"/> equals the schema built from <paramref name="footer"/>.
        /// </summary>
        /// <param name="footer">Footer from which the existing schema is created.</param>
        /// <param name="schema">Schema that is about to be written.</param>
        /// <exception cref="ParquetException">The two schemas are not equal; the message carries the reason.</exception>
        private void ValidateSchemasCompatible(ThriftFooter footer, Schema schema)
        {
            Schema fileSchema = footer.CreateModelSchema(_formatOptions);

            if (schema.Equals(fileSchema))
            {
                return;
            }

            string mismatch = schema.GetNotEqualsMessage(fileSchema, "appending", "existing");
            throw new ParquetException($"passed schema does not match existing file schema, reason: {mismatch}");
        }
コード例 #4
0
ファイル: ParquetWriter.cs プロジェクト: 0x0L/parquet-dotnet
        /// <summary>
        /// Checks that the schema of <paramref name="ds"/> equals the schema built from <paramref name="footer"/>.
        /// </summary>
        /// <param name="footer">Footer from which the existing schema is created.</param>
        /// <param name="ds">Data set that is about to be written.</param>
        /// <exception cref="ParquetException">The two schemas are not equal; the message carries the reason.</exception>
        void ValidateSchemasCompatible(ThriftFooter footer, DataSet ds)
        {
            Schema fileSchema = footer.CreateModelSchema(_formatOptions);

            if (ds.Schema.Equals(fileSchema))
            {
                return;
            }

            string mismatch = ds.Schema.GetNotEqualsMessage(fileSchema, "appending", "existing");
            throw new ParquetException($"{nameof(DataSet)} schema does not match existing file schema, reason: {mismatch}");
        }
コード例 #5
0
        /// <summary>
        /// Creates a row group reader and indexes the row group's column chunks by their path.
        /// </summary>
        /// <param name="rowGroup">Thrift row group to read from.</param>
        /// <param name="footer">Footer of the file the row group belongs to.</param>
        /// <param name="stream">Stream kept for later reads.</param>
        /// <param name="thriftStream">Thrift stream kept for later reads.</param>
        /// <param name="parquetOptions">Reader options.</param>
        /// <exception cref="ArgumentNullException">Any argument is null.</exception>
        internal ParquetRowGroupReader(
            Thrift.RowGroup rowGroup,
            ThriftFooter footer,
            Stream stream, ThriftStream thriftStream,
            ParquetOptions parquetOptions)
        {
            // arguments are validated in declaration order so the first null one is reported
            if (rowGroup == null)
            {
                throw new ArgumentNullException(nameof(rowGroup));
            }
            if (footer == null)
            {
                throw new ArgumentNullException(nameof(footer));
            }
            if (stream == null)
            {
                throw new ArgumentNullException(nameof(stream));
            }
            if (thriftStream == null)
            {
                throw new ArgumentNullException(nameof(thriftStream));
            }
            if (parquetOptions == null)
            {
                throw new ArgumentNullException(nameof(parquetOptions));
            }

            _rowGroup       = rowGroup;
            _footer         = footer;
            _stream         = stream;
            _thriftStream   = thriftStream;
            _parquetOptions = parquetOptions;

            // cache chunks; a later chunk with the same path replaces an earlier one
            foreach (Thrift.ColumnChunk chunk in _rowGroup.Columns)
            {
                _pathToChunk[chunk.GetPath()] = chunk;
            }
        }
コード例 #6
0
        /// <summary>
        /// Creates a row group writer, registers a new row group in the footer and records the
        /// current stream position as the start of that row group.
        /// </summary>
        /// <param name="schema">Schema of the data being written.</param>
        /// <param name="stream">Destination stream; its current position is captured.</param>
        /// <param name="thriftStream">Thrift stream kept for later writes.</param>
        /// <param name="footer">Footer that receives the new row group.</param>
        /// <param name="compressionMethod">Compression method, stored as given.</param>
        /// <param name="formatOptions">Format options, stored as given (not null-checked).</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="schema"/>, <paramref name="stream"/>, <paramref name="thriftStream"/> or
        /// <paramref name="footer"/> is null.
        /// </exception>
        internal ParquetRowGroupWriter(Schema schema,
                                       Stream stream,
                                       ThriftStream thriftStream,
                                       ThriftFooter footer,
                                       CompressionMethod compressionMethod,
                                       ParquetOptions formatOptions)
        {
            // arguments are validated in the same order as before so the first null one is reported
            if (schema == null)
            {
                throw new ArgumentNullException(nameof(schema));
            }
            if (stream == null)
            {
                throw new ArgumentNullException(nameof(stream));
            }
            if (thriftStream == null)
            {
                throw new ArgumentNullException(nameof(thriftStream));
            }
            if (footer == null)
            {
                throw new ArgumentNullException(nameof(footer));
            }

            _schema            = schema;
            _stream            = stream;
            _thriftStream      = thriftStream;
            _footer            = footer;
            _compressionMethod = compressionMethod;
            _formatOptions     = formatOptions;

            // register the row group first, then remember where its data begins
            _thriftRowGroup = _footer.AddRowGroup();
            _rgStartPos     = _stream.Position;

            _thriftRowGroup.Columns = new List<Thrift.ColumnChunk>();
            _thschema               = _footer.GetWriteableSchema();
        }
コード例 #7
0
        /// <summary>
        /// Creates a reader over an input stream, validating the file and loading its metadata immediately.
        /// </summary>
        /// <param name="input">Input stream, must be readable and seekable</param>
        /// <param name="parquetOptions">Optional reader options; a default instance is used when null.</param>
        /// <param name="leaveStreamOpen">When true, leaves the stream passed in <paramref name="input"/> open after disposing the reader.</param>
        /// <exception cref="ArgumentNullException">input (not checked in this body; presumably raised by the chained constructor - confirm)</exception>
        /// <exception cref="ArgumentException">stream must be readable and seekable - input</exception>
        /// <exception cref="IOException">not a Parquet file (size too small)</exception>
        public ParquetReader(Stream input, ParquetOptions parquetOptions = null, bool leaveStreamOpen = true) : this(input, leaveStreamOpen)
        {
            bool usable = input.CanRead && input.CanSeek;
            if (!usable)
            {
                throw new ArgumentException("stream must be readable and seekable", nameof(input));
            }

            // streams of 8 bytes or fewer are rejected before any validation is attempted
            if (_input.Length <= 8)
            {
                throw new IOException("not a Parquet file (size too small)");
            }

            ValidateFile();

            if (parquetOptions == null)
            {
                _parquetOptions = new ParquetOptions();
            }
            else
            {
                _parquetOptions = parquetOptions;
            }

            // metadata is read eagerly, at construction time
            _meta   = ReadMetadata();
            _footer = new ThriftFooter(_meta);

            ParquetEventSource.Current.OpenStream(input.Length, leaveStreamOpen, _meta.Row_groups.Count, _meta.Num_rows);

            InitRowGroupReaders();
        }
コード例 #8
0
        /// <summary>
        /// Decodes raw bytes from <see cref="Thrift.Statistics"/> into a CLR value
        /// </summary>
        /// <param name="fileMeta">File metadata used to build the footer that resolves the column's schema element.</param>
        /// <param name="columnChunk">Column chunk whose schema element selects the data type handler.</param>
        /// <param name="rawBytes">Raw statistics bytes to decode.</param>
        /// <returns>The value read by the matching handler, or null when <paramref name="rawBytes"/> is null or empty.</returns>
        public static object DecodeSingleStatsValue(this Thrift.FileMetaData fileMeta, Thrift.ColumnChunk columnChunk, byte[] rawBytes)
        {
            bool hasPayload = rawBytes != null && rawBytes.Length > 0;
            if (!hasPayload)
            {
                return null;
            }

            Thrift.SchemaElement element = new ThriftFooter(fileMeta).GetSchemaElement(columnChunk);

            // the handler is matched with byte arrays treated as strings
            var options = new ParquetOptions
            {
                TreatByteArrayAsString = true
            };
            IDataTypeHandler handler = DataTypeFactory.Match(element, options);

            using (var buffer = new MemoryStream(rawBytes))
            {
                using (var reader = new BinaryReader(buffer))
                {
                    return handler.Read(reader, element, rawBytes.Length);
                }
            }
        }
コード例 #9
0
        /// <summary>
        /// Creates a reader over an input stream, validating the file and loading its metadata immediately.
        /// </summary>
        /// <param name="input">Input stream, must be readable and seekable</param>
        /// <param name="parquetOptions">Optional parquet options; a default instance is used when null.</param>
        /// <param name="readerOptions">Optional reader options; a default instance is used when null.</param>
        /// <exception cref="ArgumentNullException">input</exception>
        /// <exception cref="ArgumentException">stream must be readable and seekable - input</exception>
        /// <exception cref="IOException">not a Parquet file (size too small)</exception>
        public ParquetReader3(Stream input, ParquetOptions parquetOptions = null, ReaderOptions readerOptions = null) : base(input)
        {
            if (input == null)
            {
                throw new ArgumentNullException(nameof(input));
            }
            _input = input;

            bool usable = input.CanRead && input.CanSeek;
            if (!usable)
            {
                throw new ArgumentException("stream must be readable and seekable", nameof(input));
            }

            // streams of 8 bytes or fewer are rejected before any validation is attempted
            if (_input.Length <= 8)
            {
                throw new IOException("not a Parquet file (size too small)");
            }

            ValidateFile();

            if (parquetOptions == null)
            {
                parquetOptions = new ParquetOptions();
            }
            _parquetOptions = parquetOptions;

            if (readerOptions == null)
            {
                readerOptions = new ReaderOptions();
            }
            _readerOptions = readerOptions;

            // metadata is read eagerly, at construction time
            _meta   = ReadMetadata();
            _footer = new ThriftFooter(_meta);

            InitRowGroupReaders();
        }
コード例 #10
0
        /// <summary>
        /// Test read, to be defined. Reads the file metadata, then walks every row group and collects
        /// the values of each column chunk accepted by the field predicates into a <see cref="DataSet"/>.
        /// </summary>
        /// <returns>
        /// A data set built from the filtered schema and the collected per-path values, carrying the
        /// footer's custom metadata (when present) and the raw Thrift metadata.
        /// </returns>
        /// <exception cref="ParquetException">Reading or collecting the values of a column failed.</exception>
        public DataSet Read()
        {
            _readerOptions.Validate();

            _meta = ReadMetadata();

            var footer = new ThriftFooter(_meta);

            var  pathToValues = new Dictionary<string, IList>();
            long pos          = 0;   // index of the first row of the current row group
            long rowsRead     = 0;   // rows collected so far, compared against the Count limit

            foreach (Thrift.RowGroup rg in _meta.Row_groups)
            {
                //check whether to skip RG completely: either the requested count is already
                //satisfied, or the whole group lies before the requested offset
                if ((_readerOptions.Count != -1 && rowsRead >= _readerOptions.Count) ||
                    (_readerOptions.Offset > pos + rg.Num_rows - 1))
                {
                    pos += rg.Num_rows;
                    continue;
                }

                long offset = Math.Max(0, _readerOptions.Offset - pos);
                long count  = _readerOptions.Count == -1 ? rg.Num_rows : Math.Min(_readerOptions.Count - rowsRead, rg.Num_rows);

                //rows are counted once per row group, from the first column that is actually read.
                //Counting only column index 0 left rowsRead untouched whenever that column was
                //excluded by the field predicates, so the Count limit was never reached.
                bool rowsCounted = false;

                for (int icol = 0; icol < rg.Columns.Count; icol++)
                {
                    Thrift.ColumnChunk cc   = rg.Columns[icol];
                    string             path = cc.GetPath();
                    if (_fieldPredicates != null && !_fieldPredicates.Any(p => p.IsMatch(cc, path)))
                    {
                        continue;
                    }

                    var columnarReader = new ColumnarReader(_input, cc, footer, _formatOptions);

                    try
                    {
                        IList chunkValues = columnarReader.Read(offset, count);

                        if (!pathToValues.TryGetValue(path, out IList allValues))
                        {
                            pathToValues[path] = chunkValues;
                        }
                        else
                        {
                            foreach (object v in chunkValues)
                            {
                                allValues.Add(v);
                            }
                        }

                        if (!rowsCounted)
                        {
                            //todo: this may not work
                            rowsRead   += chunkValues.Count;
                            rowsCounted = true;
                        }
                    }
                    catch (Exception ex)
                    {
                        throw new ParquetException($"fatal error reading column '{path}'", ex);
                    }
                }

                pos += rg.Num_rows;
            }

            Schema schema = footer.CreateModelSchema(_formatOptions);

            schema = schema.Filter(_fieldPredicates);
            var ds = new DataSet(schema, pathToValues, _meta.Num_rows, _meta.Created_by);
            Dictionary<string, string> customMetadata = footer.CustomMetadata;

            if (customMetadata != null)
            {
                ds.Metadata.Custom.AddRange(customMetadata);
            }
            ds.Thrift = _meta;
            return ds;
        }
コード例 #11
0
        /// <summary>
        /// Test read, to be defined. Reads the file metadata, then walks every row group and collects
        /// the values of each column chunk into a <see cref="DataSet"/>.
        /// </summary>
        /// <returns>
        /// A data set built from the parsed schema and the collected per-path values, with metadata
        /// added by the parser and the raw Thrift metadata attached.
        /// </returns>
        /// <exception cref="ParquetException">Reading or collecting the values of a column failed.</exception>
        public DataSet Read()
        {
            _readerOptions.Validate();

            _meta = ReadMetadata();

            var    footer     = new ThriftFooter(_meta);
            var    parser     = new FileMetadataParser(_meta);
            Schema fileSchema = parser.ParseSchema(_formatOptions);
            // NOTE(review): the result is never used below; the call is kept because its side effects are not visible here
            SchemaElement experimentalSchema = parser.ParseSchemaExperimental(_formatOptions);

            var  valuesByPath = new Dictionary<string, IList>();
            long groupStart   = 0;   // index of the first row of the current row group
            long rowsTaken    = 0;   // rows collected so far, compared against the Count limit

            foreach (Thrift.RowGroup rowGroup in _meta.Row_groups)
            {
                // a row group is skipped completely when the requested count is already
                // satisfied, or when the whole group lies before the requested offset
                bool quotaReached     = _readerOptions.Count != -1 && rowsTaken >= _readerOptions.Count;
                bool endsBeforeOffset = _readerOptions.Offset > groupStart + rowGroup.Num_rows - 1;

                if (!quotaReached && !endsBeforeOffset)
                {
                    long skipInGroup = Math.Max(0, _readerOptions.Offset - groupStart);

                    long takeFromGroup = rowGroup.Num_rows;
                    if (_readerOptions.Count != -1)
                    {
                        takeFromGroup = Math.Min(_readerOptions.Count - rowsTaken, rowGroup.Num_rows);
                    }

                    for (int columnIndex = 0; columnIndex < rowGroup.Columns.Count; columnIndex++)
                    {
                        Thrift.ColumnChunk chunk   = rowGroup.Columns[columnIndex];
                        SchemaElement      element = fileSchema[chunk];

                        var columnReader = new ColumnReader(chunk, element, _input, ThriftStream, _formatOptions);
                        // NOTE(review): never used below; construction is kept because its side effects are not visible here
                        var legacyReader = new ColumnarReader(_input, chunk, footer, _formatOptions);

                        try
                        {
                            IList values = columnReader.Read(skipInGroup, takeFromGroup);

                            if (valuesByPath.TryGetValue(element.Path, out IList collected))
                            {
                                foreach (object value in values)
                                {
                                    collected.Add(value);
                                }
                            }
                            else
                            {
                                valuesByPath[element.Path] = values;
                            }

                            if (columnIndex == 0)
                            {
                                //todo: this may not work
                                rowsTaken += values.Count;
                            }
                        }
                        catch (Exception ex)
                        {
                            throw new ParquetException($"fatal error reading column '{element}'", ex);
                        }
                    }
                }

                // skipped or read, the group's rows always advance the running position
                groupStart += rowGroup.Num_rows;
            }

            var result = new DataSet(fileSchema, valuesByPath, _meta.Num_rows, _meta.Created_by);

            parser.AddMeta(result);
            result.Thrift = _meta;
            return result;
        }