/// <summary>
/// Reads the file metadata, parses the schema from it, and then reads the column chunks of
/// every row group that overlaps the offset/count window given by the reader options.
/// </summary>
/// <returns>
/// A <see cref="DataSet"/> built from the parsed schema, the values collected per column path,
/// and the total row count and creator string reported by the file metadata.
/// </returns>
/// <exception cref="ParquetException">
/// Thrown when reading a column chunk fails; the underlying exception is attached as the inner exception.
/// </exception>
public DataSet Read()
{
    _readerOptions.Validate();

    _meta = ReadMetadata();
    Schema schema = new FileMetadataParser(_meta).ParseSchema(_formatOptions);

    var valuesByPath = new Dictionary<string, IList>();
    long nextGroupStart = 0;   // absolute index of the first row of the row group after the current one
    long totalRead = 0;        // rows collected so far, measured on the first column of each row group

    foreach (Thrift.RowGroup rowGroup in _meta.Row_groups)
    {
        long groupStart = nextGroupStart;
        nextGroupStart += rowGroup.Num_rows;

        // A count of -1 means "no limit"; otherwise stop reading once enough rows are collected.
        bool quotaReached = _readerOptions.Count != -1 && totalRead >= _readerOptions.Count;

        // The requested offset lies past the last row of this row group.
        bool beforeOffset = _readerOptions.Offset > groupStart + rowGroup.Num_rows - 1;

        if (quotaReached || beforeOffset)
        {
            continue;
        }

        // Offset relative to the start of this row group, and how many rows to take from it.
        long groupOffset = Math.Max(0, _readerOptions.Offset - groupStart);
        long groupCount;
        if (_readerOptions.Count == -1)
        {
            groupCount = rowGroup.Num_rows;
        }
        else
        {
            groupCount = Math.Min(_readerOptions.Count - totalRead, rowGroup.Num_rows);
        }

        bool isFirstColumn = true;
        foreach (Thrift.ColumnChunk chunk in rowGroup.Columns)
        {
            SchemaElement element = schema[chunk];
            var reader = new ColumnReader(chunk, element, _input, ThriftStream, _formatOptions);

            try
            {
                IList values = reader.Read(groupOffset, groupCount);

                if (pathToValuesContains(valuesByPath, element.Path, out IList accumulated))
                {
                    accumulated.AddRange(values);
                }
                else
                {
                    valuesByPath[element.Path] = values;
                }

                if (isFirstColumn)
                {
                    //todo: this may not work
                    totalRead += values.Count;
                }
            }
            catch (Exception ex)
            {
                throw new ParquetException($"fatal error reading column '{element}'", ex);
            }

            isFirstColumn = false;
        }
    }

    return new DataSet(schema, valuesByPath, _meta.Num_rows, _meta.Created_by);

    // Thin wrapper so the lookup reads positively at the call site.
    bool pathToValuesContains(Dictionary<string, IList> map, string path, out IList found)
    {
        return map.TryGetValue(path, out found);
    }
}
/// <summary>
/// Reads the file metadata and then the column chunks of every row group that overlaps the
/// offset/count window given by the reader options, returning the values as a <see cref="DataSet"/>.
/// </summary>
/// <remarks>
/// When field predicates are set, a column chunk is read only if at least one predicate matches it,
/// and the resulting schema is filtered with the same predicates.
/// </remarks>
/// <returns>
/// A <see cref="DataSet"/> holding the (filtered) schema, the values collected per column path,
/// the total row count and creator string from the file metadata, any custom metadata found in the
/// footer, and a reference to the raw Thrift metadata.
/// </returns>
/// <exception cref="ParquetException">
/// Thrown when reading a column chunk fails; the underlying exception is attached as the inner exception.
/// </exception>
public DataSet Read()
{
    _readerOptions.Validate();

    _meta = ReadMetadata();
    var footer = new ThriftFooter(_meta);

    var pathToValues = new Dictionary<string, IList>();
    long pos = 0;        // absolute index of the first row of the current row group
    long rowsRead = 0;   // rows collected so far across all row groups

    foreach (Thrift.RowGroup rg in _meta.Row_groups)
    {
        // Skip the row group entirely when the requested count is already satisfied
        // (-1 means "no limit") or when the requested offset lies past its last row.
        if ((_readerOptions.Count != -1 && rowsRead >= _readerOptions.Count) ||
            (_readerOptions.Offset > pos + rg.Num_rows - 1))
        {
            pos += rg.Num_rows;
            continue;
        }

        // Offset relative to the start of this row group, and how many rows to take from it.
        long offset = Math.Max(0, _readerOptions.Offset - pos);
        long count = _readerOptions.Count == -1
            ? rg.Num_rows
            : Math.Min(_readerOptions.Count - rowsRead, rg.Num_rows);

        // Rows are counted once per row group, from the first column chunk that is actually read.
        // Keying this on column index 0 would never count anything when the field predicates
        // filter that column out, which would make the Count limit ineffective.
        bool rowsCounted = false;

        for (int icol = 0; icol < rg.Columns.Count; icol++)
        {
            Thrift.ColumnChunk cc = rg.Columns[icol];
            string path = cc.GetPath();

            if (_fieldPredicates != null && !_fieldPredicates.Any(p => p.IsMatch(cc, path)))
            {
                continue;
            }

            var columnarReader = new ColumnarReader(_input, cc, footer, _formatOptions);

            try
            {
                IList chunkValues = columnarReader.Read(offset, count);

                if (!pathToValues.TryGetValue(path, out IList allValues))
                {
                    pathToValues[path] = chunkValues;
                }
                else
                {
                    foreach (object v in chunkValues)
                    {
                        allValues.Add(v);
                    }
                }

                if (!rowsCounted)
                {
                    // NOTE(review): assumes the number of values in one column equals the number
                    // of rows read; the original carried a "this may not work" todo here, so
                    // confirm this holds for every column kind.
                    rowsRead += chunkValues.Count;
                    rowsCounted = true;
                }
            }
            catch (Exception ex)
            {
                throw new ParquetException($"fatal error reading column '{path}'", ex);
            }
        }

        pos += rg.Num_rows;
    }

    Schema schema = footer.CreateModelSchema(_formatOptions);
    schema = schema.Filter(_fieldPredicates);

    var ds = new DataSet(schema, pathToValues, _meta.Num_rows, _meta.Created_by);

    Dictionary<string, string> customMetadata = footer.CustomMetadata;
    if (customMetadata != null)
    {
        ds.Metadata.Custom.AddRange(customMetadata);
    }

    ds.Thrift = _meta;
    return ds;
}