/// <summary>
/// Looks up the <see cref="SchemaElement"/> that describes the given column chunk.
/// NOTE(review): only the FIRST element of Path_in_schema is used as the key,
/// so this assumes a flat (non-nested) schema — confirm callers guarantee that.
/// </summary>
internal SchemaElement this[Thrift.ColumnChunk value]
{
   get
   {
      string key = value.Meta_data.Path_in_schema[0];
      return _pathToElement[key];
   }
}
// Computes the maximum repetition and definition levels of a column by walking
// the FLATTENED Thrift schema list forward, matching each Path_in_schema part
// to the next schema element with that name (index 'i' is never reset between
// path parts — the search resumes where the previous part matched).
//   maxRepetitionLevel: +1 for every REPEATED element on the path.
//   maxDefinitionLevel: +1 for every element that is NOT REQUIRED.
// NOTE(review): 'repeated' guards on __isset.repetition_type but 'defined' does
// not — an element with repetition_type unset counts as optional; confirm intended.
// NOTE(review): if a path part is not found, 'i' runs to the end of the schema
// list and the remaining parts are silently skipped — no error is raised.
public void GetLevels(Thrift.ColumnChunk columnChunk, out int maxRepetitionLevel, out int maxDefinitionLevel) { maxRepetitionLevel = 0; maxDefinitionLevel = 0; int i = 0; List <string> path = columnChunk.Meta_data.Path_in_schema; foreach (string pp in path) { while (i < _fileMeta.Schema.Count) { if (_fileMeta.Schema[i].Name == pp) { Thrift.SchemaElement se = _fileMeta.Schema[i]; bool repeated = (se.__isset.repetition_type && se.Repetition_type == Thrift.FieldRepetitionType.REPEATED); bool defined = (se.Repetition_type == Thrift.FieldRepetitionType.REQUIRED); if (repeated) { maxRepetitionLevel += 1; } if (!defined) { maxDefinitionLevel += 1; } break; } i++; } } }
/// <summary>
/// Finds the Thrift schema element addressed by the column chunk's
/// Path_in_schema, scanning the flattened schema list forward and matching
/// each path part in order (index 'i' resumes where the last part matched).
/// </summary>
/// <param name="columnChunk">Column chunk whose schema element is requested.</param>
/// <returns>The matching <see cref="Thrift.SchemaElement"/>.</returns>
/// <exception cref="ArgumentNullException">When <paramref name="columnChunk"/> is null.</exception>
/// <exception cref="ArgumentException">When the path cannot be resolved in the schema.</exception>
public Thrift.SchemaElement GetSchemaElement(Thrift.ColumnChunk columnChunk)
{
   if (columnChunk == null) { throw new ArgumentNullException(nameof(columnChunk)); }

   List<string> path = columnChunk.Meta_data.Path_in_schema;
   int i = 0;
   foreach (string pp in path)
   {
      while (i < _fileMeta.Schema.Count)
      {
         if (_fileMeta.Schema[i].Name == pp) { break; }
         i++;
      }
   }

   // fix: the original indexed Schema[i] unconditionally; when the last path
   // part was not found, i == Schema.Count and this threw an
   // ArgumentOutOfRangeException instead of a meaningful error.
   if (i >= _fileMeta.Schema.Count)
   {
      throw new ArgumentException(
         $"cannot find schema element for path '{string.Join("/", path)}'", nameof(columnChunk));
   }

   return _fileMeta.Schema[i];
}
/// <summary>
/// Creates a reader over a single column chunk. All dependencies are required;
/// a <see cref="ThriftStream"/> wrapper is created over the input stream.
/// </summary>
/// <exception cref="ArgumentNullException">When any argument is null.</exception>
public ColumnarReader(Stream inputStream, Thrift.ColumnChunk thriftColumnChunk, ThriftFooter footer, ParquetOptions parquetOptions)
{
   if (inputStream == null) { throw new ArgumentNullException(nameof(inputStream)); }
   if (thriftColumnChunk == null) { throw new ArgumentNullException(nameof(thriftColumnChunk)); }
   if (footer == null) { throw new ArgumentNullException(nameof(footer)); }
   if (parquetOptions == null) { throw new ArgumentNullException(nameof(parquetOptions)); }

   _inputStream = inputStream;
   _thriftColumnChunk = thriftColumnChunk;
   _footer = footer;
   _parquetOptions = parquetOptions;
   _thriftStream = new ThriftStream(inputStream);
}
/// <summary>
/// Creates a column reader over a single chunk.
/// </summary>
/// <exception cref="ArgumentNullException">When any required argument is null.</exception>
public PColumn(Thrift.ColumnChunk thriftChunk, SchemaElement schema, Stream inputStream, ThriftStream thriftStream, ParquetOptions options)
{
   // fix: validate arguments up front (consistent with ColumnarReader's
   // constructor); previously a null argument surfaced later as an opaque
   // NullReferenceException.
   _thriftChunk = thriftChunk ?? throw new ArgumentNullException(nameof(thriftChunk));
   _thrift = thriftStream ?? throw new ArgumentNullException(nameof(thriftStream));
   _schema = schema ?? throw new ArgumentNullException(nameof(schema));
   _inputStream = inputStream ?? throw new ArgumentNullException(nameof(inputStream));
   _options = options ?? throw new ArgumentNullException(nameof(options));
   _plainReader = new PlainValuesReader(options);
}
/// <summary>
/// Creates a reader over a single column chunk and pre-computes the state
/// needed for reading: max repetition/definition levels (from the footer),
/// the Thrift schema element, and its matching data type handler.
/// </summary>
/// <exception cref="ArgumentNullException">When any argument is null.</exception>
public ColumnarReader(Stream inputStream, Thrift.ColumnChunk thriftColumnChunk, ThriftFooter footer, ParquetOptions parquetOptions)
{
   if (inputStream == null) { throw new ArgumentNullException(nameof(inputStream)); }
   if (thriftColumnChunk == null) { throw new ArgumentNullException(nameof(thriftColumnChunk)); }
   if (footer == null) { throw new ArgumentNullException(nameof(footer)); }
   if (parquetOptions == null) { throw new ArgumentNullException(nameof(parquetOptions)); }

   _inputStream = inputStream;
   _thriftColumnChunk = thriftColumnChunk;
   _footer = footer;
   _parquetOptions = parquetOptions;
   _thriftStream = new ThriftStream(inputStream);

   _footer.GetLevels(_thriftColumnChunk, out int maxRepetitionLevel, out int maxDefinitionLevel);
   _maxRepetitionLevel = maxRepetitionLevel;
   _maxDefinitionLevel = maxDefinitionLevel;

   _thriftSchemaElement = _footer.GetSchemaElement(_thriftColumnChunk);
   _dataTypeHandler = DataTypeFactory.Match(_thriftSchemaElement, _parquetOptions);
}
// Memoized variant of GetLevels: results are cached per schema path using a
// StringListComparer key, since many chunks share the same Path_in_schema.
// The level computation walks the FLATTENED schema list forward, matching
// each path part to the next element with that name ('i' is never reset
// between parts):
//   maxRepetitionLevel: +1 for every REPEATED element on the path.
//   maxDefinitionLevel: +1 for every element that is NOT REQUIRED.
// NOTE(review): 'repeated' guards on __isset.repetition_type but 'defined'
// does not — an element with repetition_type unset counts as optional;
// confirm intended.
// NOTE(review): unknown path parts are silently skipped once 'i' reaches the
// end of the schema list; the (possibly partial) result is still cached.
public void GetLevels(Thrift.ColumnChunk columnChunk, out int maxRepetitionLevel, out int maxDefinitionLevel) { maxRepetitionLevel = 0; maxDefinitionLevel = 0; int i = 0; List <string> path = columnChunk.Meta_data.Path_in_schema; var comparer = new StringListComparer(path); if (_memoizedLevels.TryGetValue(comparer, out Tuple <int, int> t)) { maxRepetitionLevel = t.Item1; maxDefinitionLevel = t.Item2; return; } int fieldCount = _fileMeta.Schema.Count; foreach (string pp in path) { while (i < fieldCount) { SchemaElement schemaElement = _fileMeta.Schema[i]; if (string.CompareOrdinal(schemaElement.Name, pp) == 0) { Thrift.SchemaElement se = schemaElement; bool repeated = (se.__isset.repetition_type && se.Repetition_type == Thrift.FieldRepetitionType.REPEATED); bool defined = (se.Repetition_type == Thrift.FieldRepetitionType.REQUIRED); if (repeated) { maxRepetitionLevel += 1; } if (!defined) { maxDefinitionLevel += 1; } break; } i++; } } _memoizedLevels.Add(comparer, Tuple.Create(maxRepetitionLevel, maxDefinitionLevel)); }
/// <summary>
/// Computes the maximum definition level of a column: one level for every
/// path part whose schema element is not REQUIRED.
/// </summary>
internal int GetMaxDefinitionLevel(Thrift.ColumnChunk cc)
{
   int level = 0;
   foreach (string name in cc.Meta_data.Path_in_schema)
   {
      SchemaElement se = _pathToElement[name];
      bool required = se.Thrift.Repetition_type == Thrift.FieldRepetitionType.REQUIRED;
      if (!required)
      {
         level += 1;
      }
   }
   return level;
}
/// <summary>
/// Creates a column reader over a single flat column chunk.
/// </summary>
/// <exception cref="ArgumentNullException">When any required argument is null.</exception>
/// <exception cref="NotImplementedException">When the column path is nested (not flat).</exception>
public PColumn(Thrift.ColumnChunk thriftChunk, Schema schema, Stream inputStream, ThriftStream thriftStream, ParquetOptions options)
{
   // fix: validate arguments before dereferencing them (consistent with
   // ColumnarReader); previously a null thriftChunk threw a
   // NullReferenceException on the Meta_data access below.
   if (thriftChunk == null) { throw new ArgumentNullException(nameof(thriftChunk)); }
   if (schema == null) { throw new ArgumentNullException(nameof(schema)); }
   if (inputStream == null) { throw new ArgumentNullException(nameof(inputStream)); }
   if (thriftStream == null) { throw new ArgumentNullException(nameof(thriftStream)); }
   if (options == null) { throw new ArgumentNullException(nameof(options)); }

   if (thriftChunk.Meta_data.Path_in_schema.Count != 1)
   {
      // fix: message read "path in scheme is not flat"
      throw new NotImplementedException("path in schema is not flat");
   }

   _thriftChunk = thriftChunk;
   _thrift = thriftStream;
   _schema = schema;
   _inputStream = inputStream;
   _schemaElement = _schema[_thriftChunk];
   _options = options;
   _plainReader = new PlainValuesReader(options);
}
/// <summary>
/// Serializes one data column into a new column chunk: computes the column's
/// repetition/definition levels from the footer schema, writes the pages, and
/// fills in the chunk's value count and size counters.
/// </summary>
private Thrift.ColumnChunk WriteColumnChunk(Thrift.SchemaElement tse, List <string> path, DataColumn column, IDataTypeHandler dataTypeHandler)
{
   Thrift.ColumnChunk chunk = _footer.CreateColumnChunk(_compressionMethod, _stream, tse.Type, path, 0);
   Thrift.PageHeader pageHeader = _footer.CreateDataPage(_rowCount);

   _footer.GetLevels(chunk, out int maxRepetitionLevel, out int maxDefinitionLevel);
   List <PageTag> pages = WriteColumn(column, tse, dataTypeHandler, maxRepetitionLevel, maxDefinitionLevel);

   Thrift.ColumnMetaData meta = chunk.Meta_data;
   meta.Num_values = pageHeader.Data_page_header.Num_values;

   //the following counters must include both data size and header size
   meta.Total_compressed_size = pages.Sum(pt => pt.HeaderMeta.Compressed_page_size + pt.HeaderSize);
   meta.Total_uncompressed_size = pages.Sum(pt => pt.HeaderMeta.Uncompressed_page_size + pt.HeaderSize);

   return chunk;
}
/// <summary>
/// Writes the next data column into the current row group. Columns must be
/// supplied in schema order; the schema element is selected by the running
/// column index.
/// </summary>
/// <exception cref="ArgumentNullException">When <paramref name="column"/> is null.</exception>
public void Write(DataColumn column)
{
   if (column == null)
   {
      throw new ArgumentNullException(nameof(column));
   }

   Thrift.SchemaElement tse = _thschema[_colIdx++];
   IDataTypeHandler handler = DataTypeFactory.Match(tse, _formatOptions);

   //todo: check if the column is in the right order
   List <string> path = _footer.GetPath(tse);

   Thrift.ColumnChunk chunk = WriteColumnChunk(tse, path, column, handler);
   _thriftRowGroup.Columns.Add(chunk);
}
/// <summary>
/// Writes a batch of values as a new column chunk and returns its metadata.
/// When <paramref name="values"/> is null, an empty container matching the
/// schema's element type is created instead.
/// NOTE(review): <paramref name="offset"/> and <paramref name="count"/> are
/// never read — confirm whether they are vestigial or a latent bug.
/// </summary>
public Thrift.ColumnChunk Write(int offset, int count, IList values)
{
   if (values == null)
   {
      values = TypeFactory.Create(_schema.ElementType, _schema.IsNullable, _schema.IsRepeated);
   }

   Thrift.ColumnChunk chunk = _meta.AddColumnChunk(_compressionMethod, _output, _schema, values.Count);
   Thrift.PageHeader pageHeader = _meta.CreateDataPage(values.Count);
   List <PageTag> pages = WriteValues(_schema, values, pageHeader, _compressionMethod);

   //the following counters must include both data size and header size
   chunk.Meta_data.Total_compressed_size = pages.Sum(pt => pt.HeaderMeta.Compressed_page_size + pt.HeaderSize);
   chunk.Meta_data.Total_uncompressed_size = pages.Sum(pt => pt.HeaderMeta.Uncompressed_page_size + pt.HeaderSize);

   return chunk;
}
/// <summary>
/// Serializes one data column into a new column chunk using the writer's
/// pre-resolved schema element, and fills in value count and size counters.
/// </summary>
public Thrift.ColumnChunk Write(List <string> path, DataColumn column, IDataTypeHandler dataTypeHandler)
{
   Thrift.ColumnChunk chunk = _footer.CreateColumnChunk(_compressionMethod, _stream, _schemaElement.Type, path, 0);
   Thrift.PageHeader pageHeader = _footer.CreateDataPage(column.TotalCount);

   _footer.GetLevels(chunk, out int maxRepetitionLevel, out int maxDefinitionLevel);
   List <PageTag> pages = WriteColumn(column, _schemaElement, dataTypeHandler, maxRepetitionLevel, maxDefinitionLevel);

   //this count must be set to number of all values in the column, including nulls.
   //for hierarchy/repeated columns this is a count of flattened list, including nulls.
   chunk.Meta_data.Num_values = pageHeader.Data_page_header.Num_values;

   //the following counters must include both data size and header size
   chunk.Meta_data.Total_compressed_size = pages.Sum(pt => pt.HeaderMeta.Compressed_page_size + pt.HeaderSize);
   chunk.Meta_data.Total_uncompressed_size = pages.Sum(pt => pt.HeaderMeta.Uncompressed_page_size + pt.HeaderSize);

   return chunk;
}
/// <summary>
/// Resolves the <see cref="SchemaElement"/> for a column chunk by its full
/// joined schema path, building the path lookup cache on first use.
/// </summary>
/// <exception cref="ArgumentException">When no element exists for the path.</exception>
internal SchemaElement this[Thrift.ColumnChunk value]
{
   get
   {
      // lazily build the path -> element cache on first access
      if (_pathToElement == null)
      {
         BuildPathCache();
      }

      string path = string.Join(PathSeparator, value.Meta_data.Path_in_schema);
      if (_pathToElement.TryGetValue(path, out SchemaElement result))
      {
         return result;
      }

      throw new ArgumentException($"cannot find schema element by path '{path}'", nameof(value));
   }
}
/// <summary>
/// Renders the statistics section for a column chunk as a two-column table,
/// or an error-colored placeholder when no statistics fields are set.
/// </summary>
private void PrintStatistics(Thrift.FileMetaData fileMeta, Thrift.ColumnChunk column, Thrift.Statistics stats)
{
   WriteLine("   Statistics", T.HeadingTextColor);

   bool hasAny = stats != null &&
      (stats.__isset.null_count || stats.__isset.distinct_count || stats.__isset.min || stats.__isset.max);
   if (!hasAny)
   {
      WriteLine("      none defined", T.ErrorTextColor);
      return;
   }

   const string undefined = "undefined";

   var table = new Table("name", "value");
   table.AddRow("Null Count", stats.__isset.null_count ? stats.Null_count.ToString() : undefined);
   table.AddRow("Distinct Count", stats.__isset.distinct_count ? stats.Distinct_count.ToString() : undefined);
   table.AddRow("Min", stats.__isset.min ? fileMeta.DecodeSingleStatsValue(column, stats.Min) : undefined);
   table.AddRow("Max", stats.__isset.max ? fileMeta.DecodeSingleStatsValue(column, stats.Max) : undefined);
   table.Render(false, 6, T.HeadingTextColor, T.NormalTextColor);
}
/// <summary>
/// Creates a writer for a single column: resolves the data type handler,
/// creates the chunk and an empty data page header, and pre-computes the
/// column's max repetition/definition levels from the footer.
/// </summary>
/// <exception cref="ArgumentNullException">When a required dependency is null.</exception>
public ColumnarWriter(Stream output, ThriftStream thriftStream, ThriftFooter footer, Thrift.SchemaElement tse, List <string> path, CompressionMethod compressionMethod, ParquetOptions formatOptions, WriterOptions writerOptions)
{
   // fix: validate required dependencies up front (consistent with
   // ColumnarReader's constructor); previously a null argument surfaced
   // later as an opaque NullReferenceException.
   _output = output ?? throw new ArgumentNullException(nameof(output));
   _thriftStream = thriftStream ?? throw new ArgumentNullException(nameof(thriftStream));
   _footer = footer ?? throw new ArgumentNullException(nameof(footer));
   _tse = tse ?? throw new ArgumentNullException(nameof(tse));
   if (path == null) { throw new ArgumentNullException(nameof(path)); }
   _compressionMethod = compressionMethod;
   _formatOptions = formatOptions;
   _writerOptions = writerOptions;

   _dataTypeHandler = DataTypeFactory.Match(tse, _formatOptions);
   _chunk = _footer.CreateColumnChunk(_compressionMethod, _output, _tse.Type, path, 0);
   _ph = _footer.CreateDataPage(0);
   _footer.GetLevels(_chunk, out int maxRepetitionLevel, out int maxDefinitionLevel);
   _maxRepetitionLevel = maxRepetitionLevel;
   _maxDefinitionLevel = maxDefinitionLevel;
}
/// <summary>
/// Creates a new column chunk positioned at the output stream's current
/// offset, with metadata derived from the schema element's path and type.
/// </summary>
public Thrift.ColumnChunk AddColumnChunk(CompressionMethod compression, Stream output, SchemaElement schema, int valuesCount)
{
   Thrift.CompressionCodec codec = DataFactory.GetThriftCompression(compression);
   long startPos = output.Position;

   var chunk = new Thrift.ColumnChunk
   {
      File_offset = startPos,
      Meta_data = new Thrift.ColumnMetaData
      {
         Num_values = valuesCount,
         Type = schema.Thrift.Type,
         Codec = codec,
         Data_page_offset = startPos,
         Encodings = new List <Thrift.Encoding>
         {
            Thrift.Encoding.RLE,
            Thrift.Encoding.BIT_PACKED,
            Thrift.Encoding.PLAIN
         },
         Path_in_schema = new List <string>(schema.Path.Split(Schema.PathSeparatorChar))
      }
   };

   return chunk;
}
/// <summary>
/// Creates a new column chunk positioned at the output stream's current
/// offset for the given column type and schema path.
/// </summary>
public Thrift.ColumnChunk CreateColumnChunk(CompressionMethod compression, Stream output, Thrift.Type columnType, List <string> path, int valuesCount)
{
   Thrift.CompressionCodec codec = DataFactory.GetThriftCompression(compression);
   long startPos = output.Position;

   var meta = new Thrift.ColumnMetaData
   {
      Num_values = valuesCount,
      Type = columnType,
      Codec = codec,
      Data_page_offset = startPos,
      Encodings = new List <Thrift.Encoding>
      {
         Thrift.Encoding.RLE,
         Thrift.Encoding.BIT_PACKED,
         Thrift.Encoding.PLAIN
      },
      Path_in_schema = path
   };

   var chunk = new Thrift.ColumnChunk
   {
      File_offset = startPos,
      Meta_data = meta
   };

   return chunk;
}
/// <summary>
/// Tests whether the given column belongs to this matcher.
/// </summary>
/// <param name="columnChunk">Column chunk being tested.</param>
/// <param name="path">Schema path string of the column, compared against the matcher's name/prefix (see overrides).</param>
/// <returns>True when the column matches.</returns>
public abstract bool IsMatch(Thrift.ColumnChunk columnChunk, string path);
/// <summary>
/// Matches a column whose path equals this field's name or starts with this
/// field's prefix (i.e. the column is a child of this field).
/// </summary>
public override bool IsMatch(Thrift.ColumnChunk columnChunk, string path)
{
   // fix: StartsWith(string) defaults to a culture-sensitive comparison
   // (CA1310), which can misbehave under some locales; schema paths are
   // identifiers, so compare ordinally — consistent with the ordinal '=='
   // used for the exact-name test.
   return(path == _columnName || path.StartsWith(_prefix, System.StringComparison.Ordinal));
}
/// <summary>
/// Matches a column whose path equals this field's name, or starts with this
/// field's prefix ignoring case.
/// </summary>
public override bool IsMatch(Thrift.ColumnChunk columnChunk, string path)
{
   // fix: CurrentCultureIgnoreCase makes matching depend on the thread's
   // culture (e.g. the Turkish 'i' problem, CA1310); schema paths are
   // identifiers, so use OrdinalIgnoreCase.
   // NOTE(review): the exact-name test 'path == _columnName' is still
   // case-SENSITIVE while the prefix test ignores case — confirm this
   // asymmetry is intended.
   return(path == _columnName || path.StartsWith(_prefix, System.StringComparison.OrdinalIgnoreCase));
}