Esempio n. 1
0
        /// <summary>
        /// Reads the file metadata and then reads every column chunk of every row group,
        /// returning the collected values as a <see cref="DataSet"/>.
        /// </summary>
        /// <returns>
        /// A data set whose schema is built from the file metadata and which has been
        /// populated through <c>AddColumnar</c> with the lists read from the column chunks.
        /// </returns>
        /// <exception cref="ParquetException">
        /// Thrown when reading a column chunk fails; the original exception is attached as the inner exception.
        /// </exception>
        /// <remarks>
        /// NOTE(review): one list is collected per column chunk per row group, so a file with
        /// several row groups produces several lists for the same column. How <c>AddColumnar</c>
        /// treats that is not visible here - confirm.
        /// </remarks>
        public DataSet Read()
        {
            _meta = ReadMetadata();

            var result  = new DataSet(new Schema(_meta));
            var columns = new List<IList>();

            foreach (Thrift.RowGroup rowGroup in _meta.Row_groups)
            {
                foreach (Thrift.ColumnChunk chunk in rowGroup.Columns)
                {
                    var reader = new PColumn(chunk, result.Schema, _input, _thrift, _options);

                    // The dotted schema path identifies the column to the reader and in error messages.
                    string path = string.Join(".", chunk.Meta_data.Path_in_schema);

                    try
                    {
                        columns.Add(reader.Read(path));
                    }
                    catch (Exception ex)
                    {
                        // Wrap so the caller learns which column failed.
                        throw new ParquetException($"fatal error reading column '{path}'", ex);
                    }
                }
            }

            result.AddColumnar(columns);
            return result;
        }
Esempio n. 2
0
        /// <summary>
        /// Reads the file into a <see cref="DataSet"/>, applying the row offset and row count
        /// configured in the reader options.
        /// </summary>
        /// <returns>
        /// The data set produced by merging the per-column values, with <c>TotalRowCount</c> and
        /// <c>Metadata.CreatedBy</c> copied from the file metadata.
        /// </returns>
        /// <exception cref="NotSupportedException">Thrown when the parsed schema reports nested elements.</exception>
        /// <exception cref="ParquetException">
        /// Thrown when reading or accumulating the values of a column chunk fails; the original
        /// exception is attached as the inner exception.
        /// </exception>
        public DataSet Read()
        {
            _readerOptions.Validate();

            _meta = ReadMetadata();

            Schema schema = new FileMetadataParser(_meta).ParseSchema(_formatOptions);
            if (schema.HasNestedElements)
            {
                throw new NotSupportedException("nested structures are not yet supported");
            }

            var  valuesByPath  = new Dictionary<string, IList>();
            long groupStart    = 0;   // running sum of Num_rows over the row groups already passed
            long rowsCollected = 0;   // rows gathered so far, compared against the requested Count

            foreach (Thrift.RowGroup rowGroup in _meta.Row_groups)
            {
                long groupEnd = groupStart + rowGroup.Num_rows;

                // A row group is skipped entirely when the requested count (-1 means "no limit")
                // has already been satisfied, or when the requested offset lies beyond its last row.
                bool limitReached = _readerOptions.Count != -1 && rowsCollected >= _readerOptions.Count;
                if (limitReached || _readerOptions.Offset > groupEnd - 1)
                {
                    groupStart = groupEnd;
                    continue;
                }

                // Offset relative to the start of this row group (never negative).
                long skip = Math.Max(0, _readerOptions.Offset - groupStart);

                // Number of rows to request from this row group.
                long take;
                if (_readerOptions.Count == -1)
                {
                    take = rowGroup.Num_rows;
                }
                else
                {
                    take = Math.Min(_readerOptions.Count - rowsCollected, rowGroup.Num_rows);
                }

                bool isFirstColumn = true;
                foreach (Thrift.ColumnChunk chunk in rowGroup.Columns)
                {
                    SchemaElement element = schema[chunk];
                    var reader = new PColumn(chunk, element, _input, ThriftStream, _formatOptions);

                    try
                    {
                        IList chunkValues = reader.Read(skip, take);

                        // Values of the same schema path coming from different row groups
                        // are accumulated in a single list.
                        if (valuesByPath.TryGetValue(element.Path, out IList existing))
                        {
                            existing.AddRange(chunkValues);
                        }
                        else
                        {
                            valuesByPath[element.Path] = chunkValues;
                        }

                        if (isFirstColumn)
                        {
                            // TODO: row accounting relies on the value count of the first column only;
                            // this may not work in every case.
                            rowsCollected += chunkValues.Count;
                        }
                    }
                    catch (Exception ex)
                    {
                        // Wrap so the caller learns which schema element failed.
                        throw new ParquetException($"fatal error reading column '{element}'", ex);
                    }

                    isFirstColumn = false;
                }

                groupStart = groupEnd;
            }

            DataSet ds = new RecursiveMerge(schema).Merge(valuesByPath);

            ds.TotalRowCount      = _meta.Num_rows;
            ds.Metadata.CreatedBy = _meta.Created_by;

            return ds;
        }