Example #1
0
        /// <summary>
        /// Reads the file metadata and then the column chunks of each row group, applying the
        /// reader options' <c>Offset</c> and <c>Count</c> (a <c>Count</c> of <c>-1</c> means no limit)
        /// and the optional field predicates, and returns the collected values as a data set.
        /// </summary>
        /// <returns>
        /// A <see cref="DataSet"/> built from the predicate-filtered model schema and the values
        /// gathered per column path, with custom metadata copied from the footer when present.
        /// </returns>
        /// <exception cref="ParquetException">
        /// Thrown when reading a column chunk fails; the original error is attached as the inner exception.
        /// </exception>
        public DataSet Read()
        {
            _readerOptions.Validate();

            _meta = ReadMetadata();

            var footer = new ThriftFooter(_meta);

            var  pathToValues = new Dictionary<string, IList>();
            long pos          = 0;   // rows contained in the row groups visited so far
            long rowsRead     = 0;   // rows accumulated into the result so far

            foreach (Thrift.RowGroup rg in _meta.Row_groups)
            {
                // Skip the row group completely when the requested count is already satisfied
                // or when the whole group lies before the requested offset.
                bool countSatisfied = _readerOptions.Count != -1 && rowsRead >= _readerOptions.Count;
                bool beforeOffset   = _readerOptions.Offset > pos + rg.Num_rows - 1;
                if (countSatisfied || beforeOffset)
                {
                    pos += rg.Num_rows;
                    continue;
                }

                long offset = Math.Max(0, _readerOptions.Offset - pos);
                long count  = _readerOptions.Count == -1
                    ? rg.Num_rows
                    : Math.Min(_readerOptions.Count - rowsRead, rg.Num_rows);

                // Rows are counted once per row group, from the first column that is actually read.
                // Keying this on column index 0 breaks when that column is excluded by the field
                // predicates: rowsRead would never advance and the Count limit would be ignored.
                bool rowsCounted = false;

                foreach (Thrift.ColumnChunk cc in rg.Columns)
                {
                    string path = cc.GetPath();
                    if (_fieldPredicates != null && !_fieldPredicates.Any(p => p.IsMatch(cc, path)))
                    {
                        continue;
                    }

                    var columnarReader = new ColumnarReader(_input, cc, footer, _formatOptions);

                    try
                    {
                        IList chunkValues = columnarReader.Read(offset, count);

                        if (pathToValues.TryGetValue(path, out IList allValues))
                        {
                            // Column already seen in an earlier row group: append to its list.
                            foreach (object v in chunkValues)
                            {
                                allValues.Add(v);
                            }
                        }
                        else
                        {
                            pathToValues[path] = chunkValues;
                        }

                        if (!rowsCounted)
                        {
                            //todo (kept from original): the value count may not equal the row count in every case
                            rowsRead += chunkValues.Count;
                            rowsCounted = true;
                        }
                    }
                    catch (Exception ex)
                    {
                        throw new ParquetException($"fatal error reading column '{path}'", ex);
                    }
                }

                pos += rg.Num_rows;
            }

            Schema schema = footer.CreateModelSchema(_formatOptions);

            schema = schema.Filter(_fieldPredicates);
            var ds = new DataSet(schema, pathToValues, _meta.Num_rows, _meta.Created_by);
            Dictionary<string, string> customMetadata = footer.CustomMetadata;

            if (customMetadata != null)
            {
                ds.Metadata.Custom.AddRange(customMetadata);
            }
            ds.Thrift = _meta;
            return ds;
        }
Example #2
0
        /// <summary>
        /// Reads the file metadata, parses the schema from it, then reads the column chunks of each
        /// row group according to the reader options' <c>Offset</c> and <c>Count</c>
        /// (a <c>Count</c> of <c>-1</c> means no limit) and returns the collected values as a data set.
        /// </summary>
        /// <returns>
        /// A <see cref="DataSet"/> built from the parsed schema and the values gathered per column path,
        /// with metadata attached by the metadata parser.
        /// </returns>
        /// <exception cref="ParquetException">
        /// Thrown when reading a column chunk fails; the original error is attached as the inner exception.
        /// </exception>
        public DataSet Read()
        {
            _readerOptions.Validate();

            _meta = ReadMetadata();

            var    metaParser = new FileMetadataParser(_meta);
            Schema schema     = metaParser.ParseSchema(_formatOptions);

            // The experimental schema is not consumed here, so its result is discarded.
            // NOTE(review): the call is kept because it runs on the same parser instance that
            // AddMeta uses below; confirm it has no required side effects before removing it.
            _ = metaParser.ParseSchemaExperimental(_formatOptions);

            var  pathToValues = new Dictionary<string, IList>();
            long pos          = 0;   // rows contained in the row groups visited so far
            long rowsRead     = 0;   // rows accumulated into the result so far

            foreach (Thrift.RowGroup rg in _meta.Row_groups)
            {
                // Skip the row group completely when the requested count is already satisfied
                // or when the whole group lies before the requested offset.
                bool countSatisfied = _readerOptions.Count != -1 && rowsRead >= _readerOptions.Count;
                bool beforeOffset   = _readerOptions.Offset > pos + rg.Num_rows - 1;
                if (countSatisfied || beforeOffset)
                {
                    pos += rg.Num_rows;
                    continue;
                }

                long offset = Math.Max(0, _readerOptions.Offset - pos);
                long count  = _readerOptions.Count == -1
                    ? rg.Num_rows
                    : Math.Min(_readerOptions.Count - rowsRead, rg.Num_rows);

                for (int icol = 0; icol < rg.Columns.Count; icol++)
                {
                    Thrift.ColumnChunk cc = rg.Columns[icol];
                    SchemaElement      se = schema[cc];

                    // Only ColumnReader is used; the previous ColumnarReader instance (and the footer
                    // that existed solely to build it) was allocated per column and never read from.
                    var reader = new ColumnReader(cc, se, _input, ThriftStream, _formatOptions);

                    try
                    {
                        IList chunkValues = reader.Read(offset, count);

                        if (pathToValues.TryGetValue(se.Path, out IList allValues))
                        {
                            // Column already seen in an earlier row group: append to its list.
                            foreach (object v in chunkValues)
                            {
                                allValues.Add(v);
                            }
                        }
                        else
                        {
                            pathToValues[se.Path] = chunkValues;
                        }

                        if (icol == 0)
                        {
                            //todo (kept from original): the value count may not equal the row count in every case
                            rowsRead += chunkValues.Count;
                        }
                    }
                    catch (Exception ex)
                    {
                        throw new ParquetException($"fatal error reading column '{se}'", ex);
                    }
                }

                pos += rg.Num_rows;
            }

            var ds = new DataSet(schema, pathToValues, _meta.Num_rows, _meta.Created_by);

            metaParser.AddMeta(ds);
            ds.Thrift = _meta;
            return ds;
        }