/// <summary>
/// Reads a single byte-array value from <paramref name="reader"/>.
/// </summary>
/// <param name="reader">Reader positioned at the value.</param>
/// <param name="tse">Schema element of the column; not used by this implementation.</param>
/// <param name="length">Number of bytes to read, or -1 to read a 32-bit length prefix from the reader first.</param>
/// <returns>The bytes returned by <see cref="BinaryReader.ReadBytes(int)"/>.</returns>
protected override byte[] ReadSingle(BinaryReader reader, Thrift.SchemaElement tse, int length)
{
    // -1 is the sentinel for "the length is stored in the stream ahead of the data"
    int byteCount = length == -1 ? reader.ReadInt32() : length;

    return reader.ReadBytes(byteCount);
}
/// <summary>
/// Builds a schema tree from the file metadata, using the first schema entry as the root.
/// </summary>
/// <param name="formatOptions">Options forwarded to the recursive parser.</param>
/// <returns>The root element created from the first entry of the file schema.</returns>
public SchemaElement ParseSchemaExperimental(ParquetOptions formatOptions)
{
    // entry 0 is the root; the cursor starts at the entry right after it
    int cursor = 0;
    Thrift.SchemaElement rootEntry = _fileMeta.Schema[cursor++];

    var rootNode = new SchemaElement(rootEntry.Name, DataType.Unspecified, null);
    ParseSchemaExperimenal(rootNode, rootEntry.Num_children, ref cursor, formatOptions);

    return rootNode;
}
/// <summary>
/// Creates a model <see cref="Schema"/> from the file metadata.
/// </summary>
/// <param name="formatOptions">Options forwarded to the recursive schema builder.</param>
/// <returns>A schema wrapping the fields collected from the children of the first schema entry.</returns>
public Schema CreateModelSchema(ParquetOptions formatOptions)
{
    // the first entry only contributes its child count; its children start at the next index
    int cursor = 0;
    Thrift.SchemaElement rootEntry = _fileMeta.Schema[cursor++];

    var fields = new List<Field>();
    CreateModelSchema(null, fields, rootEntry.Num_children, ref cursor, formatOptions);

    return new Schema(fields);
}
/// <summary>
/// Asks the matching data type handler of every field to append its Thrift
/// representation to <paramref name="container"/>.
/// </summary>
/// <param name="ses">Fields to convert.</param>
/// <param name="parent">Thrift element passed to each handler as the parent of the created elements.</param>
/// <param name="container">List that receives the created Thrift schema elements.</param>
/// <exception cref="NotSupportedException">No data type handler was found for a field.</exception>
private void CreateThriftSchema(IEnumerable<Field> ses, Thrift.SchemaElement parent, IList<Thrift.SchemaElement> container)
{
    foreach (Field se in ses)
    {
        IDataTypeHandler handler = DataTypeFactory.Match(se);

        // fail with a descriptive error rather than a NullReferenceException when nothing matches
        if (handler == null)
        {
            throw new NotSupportedException($"cannot find data type handler for field '{se?.Name}'");
        }

        handler.CreateThrift(se, parent, container);
    }
}
/// <summary>
/// Reads 64-bit integers until fewer than eight bytes remain in the stream, scales
/// each one by 10^(-Scale) and appends the resulting decimal to <paramref name="result"/>.
/// </summary>
/// <param name="tse">Schema element supplying the decimal scale.</param>
/// <param name="reader">Reader to consume values from.</param>
/// <param name="result">List receiving the decoded decimals.</param>
private void ReadAsInt64(Thrift.SchemaElement tse, BinaryReader reader, IList result)
{
    decimal multiplier = (decimal)Math.Pow(10, -tse.Scale);
    Stream source = reader.BaseStream;

    // continue only while a complete 8-byte value is still available
    while (source.Length - source.Position >= sizeof(long))
    {
        long unscaled = reader.ReadInt64();
        result.Add(unscaled * multiplier);
    }
}
/// <summary>
/// Writes every decimal in <paramref name="values"/> as the byte array produced by
/// <c>BigDecimal.ToByteArray</c>.
/// </summary>
/// <param name="tse">Schema element supplying precision and scale; its <c>Type_length</c> is overwritten for every item.</param>
/// <param name="writer">Writer receiving the encoded bytes.</param>
/// <param name="values">Values to write; every item must unbox to <see cref="decimal"/>.</param>
private void WriteAsFixedLengthByteArray(Thrift.SchemaElement tse, BinaryWriter writer, IList values)
{
    foreach (object boxed in values)
    {
        decimal value = (decimal)boxed;
        byte[] encoded = new BigDecimal(value, tse.Precision, tse.Scale).ToByteArray();

        // always re-set type length as it can differ from default type length
        tse.Type_length = encoded.Length;

        writer.Write(encoded);
    }
}
/// <summary>
/// Determines whether this handler applies to the schema element.
/// </summary>
/// <remarks>
/// Matches INT96 when <c>TreatBigIntegersAsDates</c> is enabled, INT64 with a
/// TIMESTAMP_MILLIS or TIMESTAMP_MICROS converted type, and INT32 with a DATE converted type.
/// </remarks>
/// <param name="tse">Schema element to inspect.</param>
/// <param name="formatOptions">Options; only consulted when the physical type is INT96.</param>
/// <returns><c>true</c> when one of the combinations above applies.</returns>
public override bool IsMatch(Thrift.SchemaElement tse, ParquetOptions formatOptions)
{
    // Impala
    if (tse.Type == Thrift.Type.INT96 && formatOptions.TreatBigIntegersAsDates)
    {
        return true;
    }

    // every remaining combination requires an explicit converted type
    if (!tse.__isset.converted_type)
    {
        return false;
    }

    switch (tse.Type)
    {
        case Thrift.Type.INT64:
            return tse.Converted_type == Thrift.ConvertedType.TIMESTAMP_MILLIS
                || tse.Converted_type == Thrift.ConvertedType.TIMESTAMP_MICROS;

        case Thrift.Type.INT32:
            return tse.Converted_type == Thrift.ConvertedType.DATE;

        default:
            return false;
    }
}
/// <summary>
/// Encodes a single string by writing it with <c>WriteOne</c> into an in-memory buffer.
/// </summary>
/// <param name="tse">Schema element; not used by this implementation.</param>
/// <param name="x">Value to encode.</param>
/// <returns>The bytes written to the buffer.</returns>
public override byte[] PlainEncode(Thrift.SchemaElement tse, string x)
{
    using (var buffer = new MemoryStream())
    {
        // close the writer before taking the snapshot so everything is flushed into the buffer
        using (var output = new BinaryWriter(buffer))
        {
            WriteOne(output, x, false);
        }

        byte[] encoded = buffer.ToArray();
        return encoded;
    }
}
/// <summary>
/// Creates an item-less <see cref="ListField"/> from the element at <paramref name="index"/>
/// and advances the index past that element and the one following it.
/// </summary>
/// <param name="schema">Flat list of Thrift schema elements.</param>
/// <param name="index">Position of the list element; incremented by two on return.</param>
/// <param name="ownedChildCount">Always set to 1.</param>
/// <returns>The created list field, with its path hint set.</returns>
public override Field CreateSchemaElement(IList<Thrift.SchemaElement> schema, ref int index, out int ownedChildCount)
{
    Thrift.SchemaElement listElement = schema[index];
    ListField result = ListField.CreateWithNoItem(listElement.Name);

    // as we are skipping elements set path hint
    Thrift.SchemaElement containerElement = schema[index + 1];
    result.Path = $"{listElement.Name}{Schema.PathSeparator}{containerElement.Name}";

    // skip this element and child container
    index += 2;

    // we should get this element assigned back
    ownedChildCount = 1;

    return result;
}
/// <summary>
/// Creates a simple field from the element at <paramref name="index"/> and moves the index
/// to the next element.
/// </summary>
/// <param name="schema">Flat list of Thrift schema elements.</param>
/// <param name="index">Position of the element to convert; incremented by one.</param>
/// <param name="ownedChildCount">Always set to 0.</param>
/// <returns>The field produced by <c>CreateSimple</c>.</returns>
public virtual Field CreateSchemaElement(IList<Thrift.SchemaElement> schema, ref int index, out int ownedChildCount)
{
    Thrift.SchemaElement tse = schema[index++];

    // anything other than REQUIRED may hold nulls; REPEATED marks an array
    Thrift.FieldRepetitionType repetition = tse.Repetition_type;
    bool hasNulls = repetition != Thrift.FieldRepetitionType.REQUIRED;
    bool isArray = repetition == Thrift.FieldRepetitionType.REPEATED;

    Field result = CreateSimple(tse, hasNulls, isArray);
    ownedChildCount = 0;

    return result;
}
/// <summary>
/// Always throws a <see cref="NotSupportedException"/> naming the schema element, its
/// physical type (when set) and its converted type (when set).
/// </summary>
/// <param name="tse">Schema element no handler was found for.</param>
/// <exception cref="NotSupportedException">Always thrown.</exception>
private void ThrowNoHandler(Thrift.SchemaElement tse)
{
    string typeText = "<unspecified>";
    if (tse.__isset.type)
    {
        typeText = $"'{tse.Type}'";
    }

    // left as null when absent, which renders as an empty string in the message
    string convertedTypeText = null;
    if (tse.__isset.converted_type)
    {
        convertedTypeText = $" ({tse.Converted_type})";
    }

    throw new NotSupportedException($"cannot find data type handler for schema element '{tse.Name}' (type: {typeText}{convertedTypeText})");
}
/// <summary>
/// Determines whether the schema element has a DECIMAL converted type stored as
/// FIXED_LEN_BYTE_ARRAY, INT32 or INT64.
/// </summary>
/// <param name="tse">Schema element to inspect.</param>
/// <param name="formatOptions">Not used by this implementation.</param>
/// <returns><c>true</c> when the element matches.</returns>
public override bool IsMatch(Thrift.SchemaElement tse, ParquetOptions formatOptions)
{
    bool isDecimal = tse.__isset.converted_type
        && tse.Converted_type == Thrift.ConvertedType.DECIMAL;

    if (!isDecimal)
    {
        return false;
    }

    switch (tse.Type)
    {
        case Thrift.Type.FIXED_LEN_BYTE_ARRAY:
        case Thrift.Type.INT32:
        case Thrift.Type.INT64:
            return true;

        default:
            return false;
    }
}
/// <summary>
/// Creates Thrift file metadata (version 1) whose schema list holds a root element
/// followed by the Thrift representation of <paramref name="schema"/>'s fields.
/// </summary>
/// <param name="schema">Model schema to convert.</param>
/// <returns>The populated file metadata.</returns>
public Thrift.FileMetaData CreateThriftSchema(Schema schema)
{
    var elements = new List<Thrift.SchemaElement>();
    var meta = new Thrift.FileMetaData
    {
        Version = 1,
        Schema = elements
    };

    // the root goes in first; the field handlers then append their elements under it
    Thrift.SchemaElement root = AddRoot(meta.Schema);
    CreateThriftSchema(schema.Fields, root, meta.Schema);

    return meta;
}
/// <summary>
/// Reads 64-bit integers from the stream, scales each one by 10^(-Scale) and stores the
/// resulting decimals in <paramref name="dest"/> starting at <paramref name="offset"/>.
/// </summary>
/// <remarks>
/// Reading stops when fewer than eight bytes remain in the stream or when
/// <paramref name="dest"/> is full, whichever comes first. The capacity guard matches the
/// sibling readers and prevents an <see cref="IndexOutOfRangeException"/> when the stream
/// holds more values than the destination can take.
/// </remarks>
/// <param name="tse">Schema element supplying the decimal scale.</param>
/// <param name="reader">Reader to consume values from.</param>
/// <param name="dest">Destination array.</param>
/// <param name="offset">Index in <paramref name="dest"/> of the first value to write.</param>
/// <returns>The number of values written.</returns>
private int ReadAsInt64(Thrift.SchemaElement tse, BinaryReader reader, decimal[] dest, int offset)
{
    int start = offset;
    decimal scaleFactor = (decimal)Math.Pow(10, -tse.Scale);
    Stream source = reader.BaseStream;

    while (offset < dest.Length && source.Position + sizeof(long) <= source.Length)
    {
        long unscaled = reader.ReadInt64();
        dest[offset++] = unscaled * scaleFactor;
    }

    return offset - start;
}
/// <summary>
/// Creates a <c>BigDecimal</c> from raw bytes, taking precision and scale from the schema element.
/// </summary>
/// <param name="data">Bytes of the unscaled value; their order is reversed before being handed to <see cref="BigInteger"/>.</param>
/// <param name="schema">Schema element supplying <c>Precision</c> and <c>Scale</c>.</param>
public BigDecimal(byte[] data, Thrift.SchemaElement schema)
{
    Precision = schema.Precision;
    Scale = schema.Scale;

    // BigInteger(byte[]) reads little-endian, so the input is flipped first
    // (presumably it arrives big-endian - confirm against the writer)
    byte[] reversed = Enumerable.Reverse(data).ToArray();
    UnscaledValue = new BigInteger(reversed);

    // split into integer and fractional parts, then recombine as a decimal
    BigInteger divisor = BigInteger.Pow(10, Scale);
    BigInteger integerPart = BigInteger.DivRem(UnscaledValue, divisor, out BigInteger remainder);

    decimal whole = (decimal)integerPart;
    decimal fraction = (decimal)remainder / (decimal)divisor;

    DecimalValue = whole + fraction;
}
/// <summary>
/// Creates a reader for one column chunk and resolves its maximum repetition/definition
/// levels, its schema element and its data type handler.
/// </summary>
/// <param name="inputStream">Stream containing the column data.</param>
/// <param name="thriftColumnChunk">Thrift description of the column chunk.</param>
/// <param name="footer">Footer used to look up levels and the schema element.</param>
/// <param name="parquetOptions">Options used when matching the data type handler.</param>
/// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
public ColumnarReader(Stream inputStream, Thrift.ColumnChunk thriftColumnChunk, ThriftFooter footer, ParquetOptions parquetOptions)
{
    // validate in parameter order so the first null argument is the one reported
    if (inputStream == null)
    {
        throw new ArgumentNullException(nameof(inputStream));
    }
    if (thriftColumnChunk == null)
    {
        throw new ArgumentNullException(nameof(thriftColumnChunk));
    }
    if (footer == null)
    {
        throw new ArgumentNullException(nameof(footer));
    }
    if (parquetOptions == null)
    {
        throw new ArgumentNullException(nameof(parquetOptions));
    }

    _inputStream = inputStream;
    _thriftColumnChunk = thriftColumnChunk;
    _footer = footer;
    _parquetOptions = parquetOptions;

    _thriftStream = new ThriftStream(inputStream);

    _footer.GetLevels(_thriftColumnChunk, out int maxRepetition, out int maxDefinition);
    _maxRepetitionLevel = maxRepetition;
    _maxDefinitionLevel = maxDefinition;

    _thriftSchemaElement = _footer.GetSchemaElement(_thriftColumnChunk);
    _dataTypeHandler = DataTypeFactory.Match(_thriftSchemaElement, _parquetOptions);
}
/// <summary>
/// Fills <paramref name="dest"/> from <paramref name="offset"/> onwards with values read via
/// <c>ReadSingle</c>, until the stream is exhausted or the array is full.
/// </summary>
/// <param name="tse">Schema element passed through to <c>ReadSingle</c>.</param>
/// <param name="reader">Reader to consume values from.</param>
/// <param name="dest">Destination array.</param>
/// <param name="offset">Index of the first slot to fill.</param>
/// <returns>The number of values written.</returns>
private int Read(Thrift.SchemaElement tse, BinaryReader reader, TSystemType[] dest, int offset)
{
    int end = (int)reader.BaseStream.Length;
    Stream source = reader.BaseStream;

    int next = offset;
    for (; source.Position < end && next < dest.Length; next++)
    {
        // potential performance hit on calling a method
        dest[next] = ReadSingle(reader, tse, -1);
    }

    return next - offset;
}
/// <summary>
/// Returns the names of the schema element and its ancestors, outermost first. The node
/// without a parent is not included.
/// </summary>
/// <param name="schemaElement">Element to locate in the schema tree.</param>
/// <returns>Path parts ordered from the top-most ancestor down to the element itself.</returns>
public List<string> GetPath(Thrift.SchemaElement schemaElement)
{
    var parts = new List<string>();

    // walk upwards collecting names; stop at the node that has no parent
    for (ThriftSchemaTree.Node node = _tree.Find(schemaElement); node.parent != null; node = node.parent)
    {
        parts.Add(node.element.Name);
    }

    // names were gathered leaf-first, so flip them
    parts.Reverse();
    return parts;
}
/// <summary>
/// Reads <see cref="TimeSpan"/> values into <paramref name="dest"/>, dispatching on the
/// physical type of the column.
/// </summary>
/// <param name="reader">Reader to consume values from.</param>
/// <param name="tse">Schema element whose <c>Type</c> selects the decoding routine.</param>
/// <param name="dest">Destination array; must be a <c>TimeSpan[]</c>.</param>
/// <param name="offset">Index of the first slot to fill.</param>
/// <returns>The count returned by the selected routine.</returns>
/// <exception cref="NotSupportedException">The physical type is neither INT32 nor INT64.</exception>
public override int Read(BinaryReader reader, Thrift.SchemaElement tse, Array dest, int offset)
{
    Thrift.Type physicalType = tse.Type;

    if (physicalType == Thrift.Type.INT32)
    {
        return ReadAsInt32(reader, (TimeSpan[])dest, offset);
    }

    if (physicalType == Thrift.Type.INT64)
    {
        return ReadAsInt64(reader, (TimeSpan[])dest, offset);
    }

    throw new NotSupportedException();
}
/// <summary>
/// Fills <paramref name="dest"/> from <paramref name="offset"/> onwards with values read via
/// <c>ReadOne</c>, until the stream is exhausted or the array is full.
/// </summary>
/// <param name="tse">Schema element; not used by this implementation.</param>
/// <param name="reader">Reader to consume values from.</param>
/// <param name="formatOptions">Options; not used by this implementation.</param>
/// <param name="dest">Destination array.</param>
/// <param name="offset">Index of the first slot to fill.</param>
/// <returns>The number of values written.</returns>
private int Read(Thrift.SchemaElement tse, BinaryReader reader, ParquetOptions formatOptions, TSystemType[] dest, int offset)
{
    int end = (int)reader.BaseStream.Length;
    Stream source = reader.BaseStream;

    int next = offset;
    for (; source.Position < end && next < dest.Length; next++)
    {
        dest[next] = ReadOne(reader);
    }

    return next - offset;
}
/// <summary>
/// Computes the maximum repetition and definition levels of a column by walking its
/// schema path through the flat schema list. Results are memoized per path.
/// </summary>
/// <remarks>
/// For every path part the schema list is scanned forward for the next element with that
/// name. A REPEATED element adds one repetition level; any element that is not REQUIRED
/// adds one definition level. The scan position always moves past a matched element, so a
/// path part that has the same name as its parent is not matched against the parent again.
/// </remarks>
/// <param name="columnChunk">Column chunk whose <c>Meta_data.Path_in_schema</c> is resolved.</param>
/// <param name="maxRepetitionLevel">Receives the maximum repetition level.</param>
/// <param name="maxDefinitionLevel">Receives the maximum definition level.</param>
public void GetLevels(Thrift.ColumnChunk columnChunk, out int maxRepetitionLevel, out int maxDefinitionLevel)
{
    maxRepetitionLevel = 0;
    maxDefinitionLevel = 0;

    List<string> path = columnChunk.Meta_data.Path_in_schema;
    var comparer = new StringListComparer(path);

    // reuse the levels computed earlier for the same path
    if (_memoizedLevels.TryGetValue(comparer, out Tuple<int, int> t))
    {
        maxRepetitionLevel = t.Item1;
        maxDefinitionLevel = t.Item2;
        return;
    }

    int i = 0;
    int fieldCount = _fileMeta.Schema.Count;

    foreach (string pp in path)
    {
        while (i < fieldCount)
        {
            Thrift.SchemaElement se = _fileMeta.Schema[i];

            // step past the inspected element whether or not it matches, so the search
            // for the next path part starts after it
            i++;

            if (string.CompareOrdinal(se.Name, pp) != 0)
            {
                continue;
            }

            bool repeated = se.__isset.repetition_type && se.Repetition_type == Thrift.FieldRepetitionType.REPEATED;
            bool defined = se.Repetition_type == Thrift.FieldRepetitionType.REQUIRED;

            if (repeated)
            {
                maxRepetitionLevel += 1;
            }

            if (!defined)
            {
                maxDefinitionLevel += 1;
            }

            break;
        }
    }

    _memoizedLevels.Add(comparer, Tuple.Create(maxRepetitionLevel, maxDefinitionLevel));
}
/// <summary>
/// Converts these statistics to their Thrift form, plain-encoding the minimum and maximum
/// with the supplied handler.
/// </summary>
/// <param name="handler">Handler used to encode <c>MinValue</c> and <c>MaxValue</c>.</param>
/// <param name="tse">Schema element passed to the handler.</param>
/// <returns>Thrift statistics with the same bytes assigned to Min/Min_value and to Max/Max_value.</returns>
internal Thrift.Statistics ToThriftStatistics(IDataTypeHandler handler, Thrift.SchemaElement tse)
{
    byte[] encodedMin = handler.PlainEncode(tse, MinValue);
    byte[] encodedMax = handler.PlainEncode(tse, MaxValue);

    var statistics = new Thrift.Statistics();
    statistics.Null_count = NullCount;
    statistics.Distinct_count = DistinctCount;

    // both fields of each pair receive the same encoded bytes
    statistics.Min = encodedMin;
    statistics.Min_value = encodedMin;
    statistics.Max = encodedMax;
    statistics.Max_value = encodedMax;

    return statistics;
}
/// <summary>
/// Creates a primitive reader; all arguments are stored as-is without validation.
/// </summary>
/// <param name="schemaElement">Thrift schema element of the column.</param>
/// <param name="parquetOptions">Format options.</param>
/// <param name="dataTypeHandler">Data type handler for the column.</param>
/// <param name="binaryReader">Reader supplying the raw data.</param>
/// <param name="typeWidth">Width of the type as supplied by the caller (units not visible here).</param>
/// <param name="readOneFunc">Delegate that reads a single element from a <see cref="BinaryReader"/>.</param>
public PrimitiveReader(
    Thrift.SchemaElement schemaElement,
    ParquetOptions parquetOptions,
    IDataTypeHandler dataTypeHandler,
    BinaryReader binaryReader,
    int typeWidth,
    Func<BinaryReader, TElement> readOneFunc)
{
    // schema and format context
    _schemaElement = schemaElement;
    _dataTypeHandler = dataTypeHandler;
    _parquetOptions = parquetOptions;

    // data source and how to pull one element from it
    _binaryReader = binaryReader;
    _readOneFunc = readOneFunc;
    _typeWidth = typeWidth;
}
/// <summary>
/// Creates a data column writer; all arguments are stored as-is without validation.
/// </summary>
/// <param name="stream">Output stream.</param>
/// <param name="thriftStream">Thrift stream.</param>
/// <param name="footer">Footer the writer works against.</param>
/// <param name="schemaElement">Thrift schema element of the column.</param>
/// <param name="compressionMethod">Compression method.</param>
/// <param name="rowCount">Row count.</param>
public DataColumnWriter(
    Stream stream,
    ThriftStream thriftStream,
    ThriftFooter footer,
    Thrift.SchemaElement schemaElement,
    CompressionMethod compressionMethod,
    int rowCount)
{
    // output targets
    _stream = stream;
    _thriftStream = thriftStream;

    // schema context
    _footer = footer;
    _schemaElement = schemaElement;

    // write settings
    _compressionMethod = compressionMethod;
    _rowCount = rowCount;
}
/// <summary>
/// Sets the physical type, and the converted type when one is registered, on
/// <paramref name="schema"/> according to the tag registered for <paramref name="systemType"/>.
/// </summary>
/// <param name="schema">Schema element to modify.</param>
/// <param name="systemType">CLR type to look up in <c>TypeToTag</c>.</param>
/// <exception cref="NotSupportedException"><paramref name="systemType"/> has no registered tag.</exception>
public static void AdjustSchema(Thrift.SchemaElement schema, Type systemType)
{
    if (TypeToTag.TryGetValue(systemType, out TypeTag tag))
    {
        schema.Type = tag.PType;

        // the converted type is optional and only copied when the tag defines one
        if (tag.ConvertedType != null)
        {
            schema.Converted_type = tag.ConvertedType.Value;
        }

        return;
    }

    // unknown type: list everything that is supported in the error message
    var typeNames = TypeToTag.Keys.Select(t => t.ToString());
    string supportedTypes = string.Join(", ", typeNames);

    throw new NotSupportedException($"system type {systemType} is not supported, list of supported types: '{supportedTypes}'");
}
/// <summary>
/// Writes a column and returns the Thrift column chunk describing it.
/// </summary>
/// <param name="tse">Schema element of the column.</param>
/// <param name="path">Schema path handed to the footer when the chunk is created.</param>
/// <param name="column">Column data to write.</param>
/// <param name="dataTypeHandler">Handler used to write the values.</param>
/// <returns>
/// The chunk, with <c>Num_values</c> taken from the data page header created for the row
/// count and both size totals summed over the written pages.
/// </returns>
private Thrift.ColumnChunk WriteColumnChunk(Thrift.SchemaElement tse, List<string> path, DataColumn column, IDataTypeHandler dataTypeHandler)
{
    Thrift.ColumnChunk result = _footer.CreateColumnChunk(_compressionMethod, _stream, tse.Type, path, 0);
    Thrift.PageHeader pageHeader = _footer.CreateDataPage(_rowCount);

    _footer.GetLevels(result, out int maxRepetitionLevel, out int maxDefinitionLevel);
    List<PageTag> writtenPages = WriteColumn(column, tse, dataTypeHandler, maxRepetitionLevel, maxDefinitionLevel);

    Thrift.ColumnMetaData chunkMeta = result.Meta_data;
    chunkMeta.Num_values = pageHeader.Data_page_header.Num_values;

    // the following counters must include both data size and header size
    chunkMeta.Total_compressed_size = writtenPages.Sum(page => page.HeaderMeta.Compressed_page_size + page.HeaderSize);
    chunkMeta.Total_uncompressed_size = writtenPages.Sum(page => page.HeaderMeta.Uncompressed_page_size + page.HeaderSize);

    return result;
}
/// <summary>
/// Creates the Thrift schema element for a data field, appends it to
/// <paramref name="container"/> and increments the parent's child count.
/// </summary>
/// <param name="se">Field to convert; must be a <c>DataField</c>.</param>
/// <param name="parent">Parent element whose <c>Num_children</c> is incremented.</param>
/// <param name="container">List receiving the new element.</param>
public virtual void CreateThrift(Field se, Thrift.SchemaElement parent, IList<Thrift.SchemaElement> container)
{
    DataField dataField = (DataField)se;

    var element = new Thrift.SchemaElement(se.Name);
    element.Type = _thriftType;

    if (_convertedType != null)
    {
        element.Converted_type = _convertedType.Value;
    }

    // arrays take precedence over nullability when choosing the repetition type
    if (dataField.IsArray)
    {
        element.Repetition_type = Thrift.FieldRepetitionType.REPEATED;
    }
    else if (dataField.HasNulls)
    {
        element.Repetition_type = Thrift.FieldRepetitionType.OPTIONAL;
    }
    else
    {
        element.Repetition_type = Thrift.FieldRepetitionType.REQUIRED;
    }

    container.Add(element);
    parent.Num_children += 1;
}
/// <summary>
/// Recursively attaches <paramref name="count"/> children to <paramref name="parent"/>,
/// consuming elements from the flat schema list in order.
/// </summary>
/// <param name="parent">Node receiving the children; its child list is replaced.</param>
/// <param name="schema">Flat list of schema elements.</param>
/// <param name="count">Number of direct children to read.</param>
/// <param name="i">Cursor into <paramref name="schema"/>; advanced past every consumed element.</param>
private void BuildSchema(Node parent, List<Thrift.SchemaElement> schema, int count, ref int i)
{
    parent.children = new List<Node>();

    for (int remaining = count; remaining > 0; remaining--)
    {
        Thrift.SchemaElement element = schema[i++];

        var childNode = new Node();
        childNode.element = element;
        childNode.parent = parent;
        parent.children.Add(childNode);

        // a group's descendants directly follow it in the flat list
        if (element.Num_children > 0)
        {
            BuildSchema(childNode, schema, element.Num_children, ref i);
        }
    }
}
/// <summary>
/// Decodes a single string from its encoded bytes by reading it with <c>ReadSingle</c>.
/// </summary>
/// <param name="tse">Schema element; not used here (<c>ReadSingle</c> is called with a null element).</param>
/// <param name="encoded">Encoded bytes, or <c>null</c>.</param>
/// <returns>The decoded string, or <c>null</c> when <paramref name="encoded"/> is <c>null</c>.</returns>
public override object PlainDecode(Thrift.SchemaElement tse, byte[] encoded)
{
    if (encoded == null)
    {
        return null;
    }

    using (var buffer = new MemoryStream(encoded))
    using (var input = new BinaryReader(buffer))
    {
        string decoded = ReadSingle(input, null, -1, false);
        return decoded;
    }
}
/// <summary>
/// Reads strings via <c>ReadOne</c> into <paramref name="dest"/> starting at
/// <paramref name="offset"/>.
/// </summary>
/// <remarks>
/// Reading stops when the stream is exhausted or when the destination is full, whichever
/// comes first. The capacity guard matches the sibling readers and prevents an
/// <see cref="IndexOutOfRangeException"/> when the stream holds more values than the
/// destination can take.
/// </remarks>
/// <param name="reader">Reader to consume values from.</param>
/// <param name="tse">Schema element; not used by this implementation.</param>
/// <param name="dest">Destination array; must be a <c>string[]</c>.</param>
/// <param name="offset">Index of the first slot to fill.</param>
/// <param name="formatOptions">Options; not used by this implementation.</param>
/// <returns>The number of values written.</returns>
public override int Read(BinaryReader reader, Thrift.SchemaElement tse, Array dest, int offset, ParquetOptions formatOptions)
{
    string[] tdest = (string[])dest;

    int totalLength = (int)reader.BaseStream.Length;
    Stream s = reader.BaseStream;

    int idx = offset;
    while (s.Position < totalLength && idx < tdest.Length)
    {
        tdest[idx++] = ReadOne(reader);
    }

    return idx - offset;
}