/// <summary>
/// Builds a schema from its SDF representation: registers the built-in types,
/// creates the top element, reads the user-defined types and then runs
/// <c>VerifyElement</c> over the top element and every user-defined node type.
/// </summary>
/// <param name="schema">SDF representation of a schema; must be a node named "schema".</param>
/// <exception cref="InvalidDataException">
/// The root is not a (schema) node, or a child is neither a node-type nor a literal-type.
/// </exception>
public Schema(SDF schema)
{
    var root = schema as Node;
    bool isSchemaNode = root != null && root.Name.Equals("schema");
    if (!isSchemaNode)
    {
        throw new InvalidDataException("Schema must be a (schema) node.");
    }

    // built-ins ("bool" and "boolean" share a single instance)
    _builtinTypes["node"] = new SchemaSimpleNodeType();
    _builtinTypes["string"] = new SchemaStringType();
    var booleanType = new SchemaBooleanType();
    _builtinTypes["bool"] = booleanType;
    _builtinTypes["boolean"] = booleanType;
    _builtinTypes["number"] = new SchemaNumberType();
    _builtinTypes["null"] = new SchemaNullType();

    _topElement = MakeElement(root.Attributes["top-element"]);

    // read user-defined types
    foreach (var child in root.Children)
    {
        var made = MakeType(child);

        var userNodeType = made as SchemaNodeType;
        if (userNodeType != null)
        {
            _types[userNodeType.Name] = userNodeType;
            continue;
        }

        var userLiteralType = made as SchemaLiteralType;
        if (userLiteralType == null)
        {
            throw new InvalidDataException("User-defined type must be either node-type or literal-type.");
        }

        _types[userLiteralType.Name] = userLiteralType;
    }

    // add built-ins (written after the user-defined types, so a built-in wins on a name clash)
    foreach (var builtin in _builtinTypes)
    {
        _types[builtin.Key] = builtin.Value;
    }

    // verify the top element, then the children and attributes of each user-defined node type
    VerifyElement(_topElement);
    foreach (var entry in _types)
    {
        // built-ins need no verification; a literal type can only reference a built-in
        if (entry.Value is SchemaBuiltinType || entry.Value is SchemaLiteralType)
        {
            continue;
        }

        var nodeType = entry.Value as SchemaNodeType;
        VerifyElement(nodeType.Children);
        foreach (var attribute in nodeType.Attributes)
        {
            VerifyElement(attribute.Element);
        }
    }

    ErrorMessage = null;
}
/// <summary>
/// Dispatches partial validation of <paramref name="input"/> according to the
/// concrete kind of <paramref name="schemaElement"/>.
/// </summary>
/// <param name="schemaElement">Schema element the matches are validated against.</param>
/// <param name="input">Matches collected so far.</param>
/// <returns>
/// <c>true</c> when the input is acceptable so far; <c>false</c> otherwise
/// (the branches handled here also set <c>ErrorMessage</c>).
/// </returns>
private bool ValidateMatchesPartial(SchemaElement schemaElement, List<Match> input)
{
    var nodeElement = schemaElement as SchemaNodeElement;
    if (nodeElement != null)
    {
        if (input.Count > 1)
        {
            ErrorMessage = "One node expected, multiple found.";
            return false;
        }

        // an empty list might simply be incomplete yet
        return input.Count == 0 || ValidateMatchesNodeElementPartial(nodeElement, input[0]);
    }

    var literalElement = schemaElement as SchemaLiteralElement;
    if (literalElement != null)
    {
        if (input.Count > 1)
        {
            ErrorMessage = "One literal expected, multiple found.";
            return false;
        }

        // an empty list might be incomplete yet; a literal itself cannot match partially
        return input.Count == 0 || ValidateMatchesLiteralElement(literalElement, input[0]);
    }

    var listElement = schemaElement as SchemaListElement;
    if (listElement != null)
    {
        return ValidateMatchesListElementPartial(listElement, input);
    }

    var sequenceElement = schemaElement as SchemaSequenceElement;
    if (sequenceElement != null)
    {
        return ValidateMatchesSequenceElementPartial(sequenceElement, input);
    }

    var oneOfElement = schemaElement as SchemaOneOfElement;
    if (oneOfElement != null)
    {
        return ValidateMatchesOneOfElementPartial(oneOfElement, input);
    }

    ErrorMessage = "Unknown element type in ValidateMatches.";
    return false;
}
/// <summary>
/// Rebuilds the schema tree from the flat element list held in the file metadata
/// and returns it as a <see cref="Schema"/>.
/// </summary>
/// <param name="formatOptions">Options handed to every <see cref="SchemaElement"/> that is created.</param>
/// <returns>A schema made of the children of a temporary root element.</returns>
public Schema ParseSchema(ParquetOptions formatOptions)
{
    // Consumes flat entries starting at index i until 'node' owns 'count' children,
    // recursing into every entry that declares children of its own.
    void Build(SchemaElement node, ref int i, int count, bool isRoot)
    {
        // elements directly under the temporary root get no parent
        SchemaElement parent = isRoot ? null : node;

        while (node.Children.Count < count)
        {
            Thrift.SchemaElement current = _fileMeta.Schema[i];
            SchemaElement built;

            if (current.Converted_type != Thrift.ConvertedType.LIST)
            {
                Type containerType = current.Num_children > 0 ? typeof(Row) : null;
                built = new SchemaElement(current, parent, formatOptions, containerType);
                built.MaxDefinitionLevel = CountRepetitions(Thrift.FieldRepetitionType.OPTIONAL, built);
                built.MaxRepetitionLevel = CountRepetitions(Thrift.FieldRepetitionType.REPEATED, built);
            }
            else
            {
                // a list occupies three consecutive entries: top, list and element
                Thrift.SchemaElement listTop = current;
                i += 1;
                Thrift.SchemaElement listMiddle = _fileMeta.Schema[i];
                i += 1;
                Thrift.SchemaElement listItem = _fileMeta.Schema[i];

                // the non-generic IEnumerable is augmented to a generic one in the constructor
                Type listType = listItem.Num_children == 0
                    ? typeof(IEnumerable)
                    : typeof(IEnumerable<Row>);

                built = new SchemaElement(listItem, parent, formatOptions, listType, listTop.Name);
                built.Path = string.Join(Schema.PathSeparator, listTop.Name, listMiddle.Name, listItem.Name);
                built.IsRepeated = true;
                if (!isRoot)
                {
                    built.Path = node.Path + Schema.PathSeparator + built.Path;
                }

                built.MaxDefinitionLevel = CountRepetitions(Thrift.FieldRepetitionType.OPTIONAL, built, listMiddle);
                built.MaxRepetitionLevel = CountRepetitions(Thrift.FieldRepetitionType.REPEATED, built, listMiddle, listTop);

                // children (if any) are declared by the innermost list entry
                current = listItem;
            }

            node.Children.Add(built);
            i += 1;

            if (current.Num_children > 0)
            {
                Build(built, ref i, current.Num_children, false);
            }
        }
    }

    // extract the schema tree; entry 0 is the schema root, so reading starts at index 1
    var treeRoot = new SchemaElement<int>("root");
    int cursor = 1;
    Build(treeRoot, ref cursor, _fileMeta.Schema[0].Num_children, true);

    return new Schema(treeRoot.Children);
}
// making Schema elements, types and verifying those are OK

/// <summary>
/// Recursively checks that every type referenced by <paramref name="schemaElement"/>
/// is declared in <c>_types</c> and is of the right kind (literal vs. node).
/// A <c>null</c> element is accepted silently.
/// </summary>
/// <param name="schemaElement">Element to verify; may be <c>null</c>.</param>
/// <exception cref="InvalidDataException">
/// A referenced type is undeclared or of the wrong kind, or the element kind is unknown.
/// </exception>
private void VerifyElement(SchemaElement schemaElement)
{
    if (schemaElement == null)
    {
        return;
    }

    // containers: descend into the wrapped element(s)
    var list = schemaElement as SchemaListElement;
    if (list != null)
    {
        VerifyElement(list.Element);
        return;
    }

    var oneOf = schemaElement as SchemaOneOfElement;
    if (oneOf != null)
    {
        foreach (var option in oneOf.Options)
        {
            VerifyElement(option);
        }

        return;
    }

    var sequence = schemaElement as SchemaSequenceElement;
    if (sequence != null)
    {
        foreach (var item in sequence.Sequence)
        {
            VerifyElement(item);
        }

        return;
    }

    var literal = schemaElement as SchemaLiteralElement;
    if (literal != null)
    {
        if (!_types.ContainsKey(literal.TypeName))
        {
            throw new InvalidDataException("Literal element references an undeclared type '" + literal.TypeName + "'");
        }

        var declared = _types[literal.TypeName];

        // acceptable: a built-in other than the simple node type, or a user-defined literal type
        bool isLiteralBuiltin = declared is SchemaBuiltinType && !(declared is SchemaSimpleNodeType);
        if (!isLiteralBuiltin && !(declared is SchemaLiteralType))
        {
            throw new InvalidDataException("Literal element references non-literal type.");
        }

        return;
    }

    var node = schemaElement as SchemaNodeElement;
    if (node == null)
    {
        throw new InvalidDataException("Unknown element type found in VerifyElement.");
    }

    if (!_types.ContainsKey(node.TypeName))
    {
        throw new InvalidDataException("Node element references an undeclared type '" + node.TypeName + "'");
    }

    var nodeType = _types[node.TypeName];
    bool isNodeKind = nodeType is SchemaSimpleNodeType || nodeType is SchemaNodeType;
    if (!isNodeKind)
    {
        throw new InvalidDataException("Node element references non-node type.");
    }
}
/// <summary>
/// Abstract factory method: the concrete subclass supplies the <see cref="SchemaLiteral"/> instance.
/// </summary>
/// <param name="owner">
/// Schema element supplied by the caller.
/// NOTE(review): presumably the element that owns the new literal; confirm against implementations.
/// </param>
/// <param name="objectToWrap">
/// Object supplied by the caller.
/// NOTE(review): presumably the underlying object the literal wraps; confirm against implementations.
/// </param>
/// <returns>The <see cref="SchemaLiteral"/> produced by the implementing class.</returns>
public abstract SchemaLiteral createSchemaLiteral(SchemaElement owner,object objectToWrap);
/// <summary>
/// Abstract factory method: the concrete subclass supplies the <see cref="SchemaLiteral"/> instance.
/// </summary>
/// <param name="owner">
/// Schema element supplied by the caller.
/// NOTE(review): presumably the element that owns the new literal; confirm against implementations.
/// </param>
/// <param name="objectToWrap">
/// Object supplied by the caller.
/// NOTE(review): presumably the underlying object the literal wraps; confirm against implementations.
/// </param>
/// <returns>The <see cref="SchemaLiteral"/> produced by the implementing class.</returns>
public abstract SchemaLiteral createSchemaLiteral(SchemaElement owner, object objectToWrap);
/// <summary>
/// Reads the file into a <see cref="DataSet"/>: parses the metadata and schema, reads
/// every column chunk of the row groups selected by the reader options' offset and count,
/// and merges the per-column values.
/// </summary>
/// <returns>The merged data set, with total row count and creator copied from the file metadata.</returns>
/// <exception cref="NotSupportedException">The schema contains nested elements.</exception>
/// <exception cref="ParquetException">Reading or collecting a column's values failed.</exception>
public DataSet Read()
{
    _readerOptions.Validate();

    _meta = ReadMetadata();
    Schema schema = new FileMetadataParser(_meta).ParseSchema(_formatOptions);
    if (schema.HasNestedElements)
    {
        throw new NotSupportedException("nested structures are not yet supported");
    }

    var valuesByPath = new Dictionary<string, IList>();
    long nextGroupStart = 0;
    long rowsRead = 0;

    foreach (Thrift.RowGroup rowGroup in _meta.Row_groups)
    {
        // absolute index of this group's first row; the running position advances either way
        long groupStart = nextGroupStart;
        nextGroupStart += rowGroup.Num_rows;

        // check whether to skip the row group completely
        bool limitReached = _readerOptions.Count != -1 && rowsRead >= _readerOptions.Count;
        if (limitReached || _readerOptions.Offset > groupStart + rowGroup.Num_rows - 1)
        {
            continue;
        }

        long offset = Math.Max(0, _readerOptions.Offset - groupStart);
        long count = _readerOptions.Count == -1
            ? rowGroup.Num_rows
            : Math.Min(_readerOptions.Count - rowsRead, rowGroup.Num_rows);

        for (int columnIndex = 0; columnIndex < rowGroup.Columns.Count; columnIndex++)
        {
            Thrift.ColumnChunk chunk = rowGroup.Columns[columnIndex];
            SchemaElement element = schema[chunk];
            var column = new PColumn(chunk, element, _input, ThriftStream, _formatOptions);

            try
            {
                IList chunkValues = column.Read(offset, count);

                if (pathToValuesContains(valuesByPath, element.Path, out IList existing))
                {
                    existing.AddRange(chunkValues);
                }
                else
                {
                    valuesByPath[element.Path] = chunkValues;
                }

                if (columnIndex == 0)
                {
                    //todo: this may not work
                    rowsRead += chunkValues.Count;
                }
            }
            catch (Exception ex)
            {
                throw new ParquetException($"fatal error reading column '{element}'", ex);
            }
        }
    }

    DataSet ds = new RecursiveMerge(schema).Merge(valuesByPath);
    ds.TotalRowCount = _meta.Num_rows;
    ds.Metadata.CreatedBy = _meta.Created_by;
    return ds;

    // thin wrapper so the lookup reads positively at the call site
    bool pathToValuesContains(Dictionary<string, IList> map, string path, out IList found)
    {
        return map.TryGetValue(path, out found);
    }
}
/// <summary>
/// Asserts that <paramref name="element"/> carries the expected name and belongs to the
/// "samples:schemas:simpleSchema" namespace.
/// </summary>
/// <param name="element">Schema element under test.</param>
/// <param name="name">Name the element is expected to have.</param>
private static void VerifyElement(SchemaElement element, String name)
{
    const string expectedNamespace = "samples:schemas:simpleSchema";

    Assert.That(element.Name, Is.EqualTo(name));
    Assert.That(element.Namespace, Is.EqualTo(expectedNamespace));
}
/// <summary>
/// Rebuilds the schema tree from the flat element list in the file metadata, delegating
/// LIST and MAP / MAP_KEY_VALUE entries to <c>BuildListSchema</c> and <c>BuildMapSchema</c>.
/// </summary>
/// <param name="formatOptions">Options forwarded to the element builders.</param>
/// <returns>A schema built from the children of a temporary "root" element.</returns>
public Schema ParseSchema(ParquetOptions formatOptions)
{
    // Consumes flat entries starting at index i until 'node' owns 'count' children,
    // recursing into every entry that declares children of its own.
    void Build(SchemaElement node, ref int i, int count, bool isRoot)
    {
        while (node.Children.Count < count)
        {
            Thrift.SchemaElement tse = _fileMeta.Schema[i];
            SchemaElement mse;

            // the list/map builders take tse and i by ref, so they may advance both
            if (tse.Converted_type == Thrift.ConvertedType.LIST)
            {
                mse = BuildListSchema(ref tse, ref i, isRoot, node, formatOptions);
            }
            else if (tse.Converted_type == Thrift.ConvertedType.MAP || tse.Converted_type == Thrift.ConvertedType.MAP_KEY_VALUE)
            {
                mse = BuildMapSchema(ref tse, ref i, isRoot, node, formatOptions);
            }
            else
            {
                // plain element: a Row container when it has children, no container otherwise
                Type containerType = tse.Num_children > 0 ? typeof(Row) : null;
                SchemaElement parent = isRoot ? null : node;
                mse = BuildSchemaElement(tse, parent, formatOptions, containerType);
                AddFlags(mse, tse);
            }

            // Levels are captured before the element is added and written back afterwards.
            // NOTE(review): presumably Children.Add can recalculate the levels - confirm.
            int mrl = mse.MaxRepetitionLevel;
            int mdl = mse.MaxDefinitionLevel;
            node.Children.Add(mse);
            mse.MaxRepetitionLevel = mrl;
            mse.MaxDefinitionLevel = mdl;

            i += 1;

            if (tse.Num_children > 0)
            {
                Build(mse, ref i, tse.Num_children, false);
            }
        }
    }

    //extract schema tree
    var root = new SchemaElement <int>("root") { Path = string.Empty };
    // automatic level updates are switched off on the temporary root while the tree is built
    root.AutoUpdateLevels = false;
    // index 0 is used only for its child count; reading starts at index 1
    int start = 1;
    Build(root, ref start, _fileMeta.Schema[0].Num_children, true);

    // top-level elements are detached before being handed to the Schema
    foreach (SchemaElement se in root.Children)
    {
        se.Detach();
    }
    root.AutoUpdateLevels = true;

    return(new Schema(root.Children));
}
/// <summary>
/// Builds the schema element for a list column, which occupies three consecutive entries
/// of the flat metadata schema: the entry passed in plus the two that follow it.
/// </summary>
/// <param name="tse">On entry the top list entry; on exit the innermost (element) entry.</param>
/// <param name="i">Index of <paramref name="tse"/>; advanced by two.</param>
/// <param name="isRoot">Whether <paramref name="node"/> is the temporary root.</param>
/// <param name="node">Element the list is being built under.</param>
/// <param name="formatOptions">Options forwarded to <c>BuildSchemaElement</c>.</param>
/// <returns>The repeated schema element describing the list.</returns>
private SchemaElement BuildListSchema(ref Thrift.SchemaElement tse, ref int i, bool isRoot, SchemaElement node, ParquetOptions formatOptions)
{
    Thrift.SchemaElement outer = tse;
    i += 1;
    Thrift.SchemaElement repeated = _fileMeta.Schema[i];
    i += 1;
    Thrift.SchemaElement item = _fileMeta.Schema[i];

    // the non-generic IEnumerable is augmented to a generic one in the constructor
    Type containerType = item.Num_children == 0
        ? typeof(IEnumerable)
        : typeof(IEnumerable<Row>);
    SchemaElement parent = isRoot ? null : node;

    SchemaElement listElement = BuildSchemaElement(item, parent, formatOptions, containerType, outer.Name);

    listElement.Path = string.Join(Schema.PathSeparator, outer.Name, repeated.Name, item.Name);
    listElement.IsRepeated = true;
    if (!isRoot)
    {
        listElement.Path = node.Path + Schema.PathSeparator + listElement.Path;
    }

    AddFlags(listElement, outer, repeated, item);

    // hand the innermost entry back to the caller
    tse = item;
    return listElement;
}
/// <summary>
/// Builds the schema element for a map column. The entry passed in is followed in the
/// flat metadata schema by a container entry and then the key and value entries.
/// </summary>
/// <param name="tse">On entry the top map entry; on exit the value entry.</param>
/// <param name="i">Index of <paramref name="tse"/>; advanced by three.</param>
/// <param name="isRoot">Whether <paramref name="node"/> is the temporary root.</param>
/// <param name="node">Element the map is being built under; becomes the map's parent.</param>
/// <param name="formatOptions">Options used to resolve the key and value system types.</param>
/// <returns>
/// The map schema element, typed as a <c>Dictionary&lt;,&gt;</c> of the key and value types,
/// with the key and value elements stored in its <c>Extra</c> collection.
/// </returns>
private SchemaElement BuildMapSchema(ref Thrift.SchemaElement tse, ref int i, bool isRoot, SchemaElement node, ParquetOptions formatOptions)
{
    //tse is followed by map container (REPEATED) and another two elements - key and value
    Thrift.SchemaElement tseContainer = _fileMeta.Schema[++i];
    Thrift.SchemaElement tseKey = _fileMeta.Schema[++i];
    Thrift.SchemaElement tseValue = _fileMeta.Schema[++i];

    Type keyType = TypePrimitive.GetSystemTypeBySchema(tseKey, formatOptions);
    Type valueType = TypePrimitive.GetSystemTypeBySchema(tseValue, formatOptions);
    Type gt = typeof(Dictionary<,>);
    Type masterType = gt.MakeGenericType(keyType, valueType);

    //master schema
    var se = new SchemaElement(tseContainer, tse.Name, masterType, masterType,
        string.Join(Schema.PathSeparator, tse.Name, tseContainer.Name));
    if (!isRoot)
    {
        // Prefix with the path of the element this map sits under. Concatenating
        // node.Parent here would splice in that element's string form, not a path.
        se.Path = node.Path + Schema.PathSeparator + se.Path;
    }
    se.Parent = node;
    se.IsMap = true;
    AddFlags(se, tse, tseContainer);

    //extra schemas: key and value hang off the map element
    var kse = new SchemaElement(tseKey, null, keyType, keyType, null) { Parent = se };
    var vse = new SchemaElement(tseValue, null, valueType, valueType, null) { Parent = se };
    se.Extra.Add(kse);
    se.Extra.Add(vse);
    AddFlags(kse, tseKey);
    AddFlags(vse, tseValue);

    // hand the last consumed entry back to the caller
    tse = tseValue;
    return se;
}
/// <summary>
/// Encodes one column's values into a data page (preceded by a dictionary page when
/// dictionary encoding is enabled and succeeds), compresses and writes the pages, and
/// reports the header of every page written.
/// </summary>
/// <param name="schema">Schema element describing the column.</param>
/// <param name="values">Column values; flattened first when the column is repeated.</param>
/// <param name="ph">Header of the data page; its encoding is switched to PLAIN_DICTIONARY when a dictionary is used.</param>
/// <param name="compression">Compression method passed to <c>Compress</c> for every page.</param>
/// <returns>One tag per written page: the dictionary page (if any) first, then the data page.</returns>
private List<PageTag> WriteValues(SchemaElement schema, IList values, Thrift.PageHeader ph, CompressionMethod compression)
{
    byte[] dictionaryBytes = null;
    int dictionaryCount = 0;
    byte[] dataBytes;

    //flatten values if the field is repeatable
    if (values != null && schema.IsRepeated)
    {
        values = FlattenRepeatables(values, schema);
    }

    using (var pageStream = new MemoryStream())
    using (var pageWriter = new BinaryWriter(pageStream))
    {
        //write repetitions
        if (schema.IsRepeated)
        {
            List<int> repetitionLevels = CreateRepetitions(values, schema);
            _rleWriter.Write(pageWriter, _definitionsSchema, repetitionLevels, out IList repetitionExtra);
        }

        //write definitions
        if (schema.IsNullable || schema.IsRepeated)
        {
            CreateDefinitions(values, schema, out IList definedValues, out List<int> definitionLevels);
            values = definedValues;
            _rleWriter.Write(pageWriter, _definitionsSchema, definitionLevels, out IList definitionExtra);
        }

        //write data: dictionary encoding when enabled and accepted by the writer, plain otherwise
        if (_writerOptions.UseDictionaryEncoding && _dicWriter.Write(pageWriter, schema, values, out IList dictionaryValues))
        {
            dictionaryCount = dictionaryValues.Count;
            ph.Data_page_header.Encoding = Thrift.Encoding.PLAIN_DICTIONARY;

            // the dictionary itself is plain-encoded into a page of its own
            using (var dictionaryStream = new MemoryStream())
            using (var dictionaryWriter = new BinaryWriter(dictionaryStream))
            {
                _plainWriter.Write(dictionaryWriter, schema, dictionaryValues, out IList dictionaryExtra);
                dictionaryBytes = dictionaryStream.ToArray();
            }
        }
        else
        {
            _plainWriter.Write(pageWriter, schema, values, out IList plainExtra);
        }

        dataBytes = pageStream.ToArray();
    }

    var tags = new List<PageTag>();

    if (dictionaryBytes != null)
    {
        Thrift.PageHeader dictionaryHeader = _meta.CreateDictionaryPage(dictionaryCount);
        dictionaryBytes = Compress(dictionaryHeader, dictionaryBytes, compression);
        int dictionaryHeaderSize = Write(dictionaryHeader, dictionaryBytes);
        tags.Add(new PageTag { HeaderSize = dictionaryHeaderSize, HeaderMeta = dictionaryHeader });
    }

    dataBytes = Compress(ph, dataBytes, compression);
    int dataHeaderSize = Write(ph, dataBytes);
    tags.Add(new PageTag { HeaderSize = dataHeaderSize, HeaderMeta = ph });

    return tags;
}
/// <summary>
/// Abstract factory method: the concrete subclass supplies the <see cref="SchemaAssociation"/> instance.
/// </summary>
/// <param name="owner">
/// Schema element supplied by the caller.
/// NOTE(review): presumably the element that owns the new association; confirm against implementations.
/// </param>
/// <param name="objectToWrap">
/// Object supplied by the caller.
/// NOTE(review): presumably the underlying object the association wraps; confirm against implementations.
/// </param>
/// <returns>The <see cref="SchemaAssociation"/> produced by the implementing class.</returns>
public abstract SchemaAssociation createSchemaAssociation(SchemaElement owner, object objectToWrap);
/// <summary>
/// Creates a merger by storing the supplied schema element, format options and values.
/// </summary>
/// <param name="schema">Schema element kept in <c>_schema</c>.</param>
/// <param name="formatOptions">Format options kept in <c>_formatOptions</c>.</param>
/// <param name="values">Values kept in <c>_values</c>.</param>
public ValueMerger(SchemaElement schema, ParquetOptions formatOptions, IList values)
{
    this._values = values;
    this._formatOptions = formatOptions;
    this._schema = schema;
}
/// <summary>
/// Plain-encoding override that produces no bytes: it always returns <c>null</c>.
/// </summary>
/// <param name="tse">Schema element; not used.</param>
/// <param name="x">Value to encode; not used.</param>
/// <returns>Always <c>null</c>.</returns>
public override byte[] PlainEncode(SchemaElement tse, bool x)
{
    return null;
}
// ------------------------------------------
// ACCESSORS
// ------------------------------------------

#region Accessors

/// <summary>
/// Looks up the schema element with the specified ID by delegating to <c>RootZone</c>.
/// </summary>
/// <param name="id">The ID of the element to look for.</param>
/// <param name="parentMetobject1">Parent object to consider; currently not used by this method.</param>
/// <returns>
/// Whatever <c>RootZone.GetElementWithId</c> returns for <paramref name="id"/>,
/// or <c>null</c> when <c>RootZone</c> is <c>null</c>.
/// </returns>
public SchemaElement GetElementWithId(String id, SchemaElement parentMetobject1 = null)
    => RootZone?.GetElementWithId(id);
/// <summary>
/// Plain-decoding override that produces no value: it always returns <c>null</c>.
/// </summary>
/// <param name="tse">Schema element; not used.</param>
/// <param name="encoded">Encoded bytes; not used.</param>
/// <returns>Always <c>null</c>.</returns>
public override object PlainDecode(SchemaElement tse, byte[] encoded)
{
    return null;
}
/// <summary>
/// Abstract factory method: the concrete subclass supplies the <see cref="SchemaAssociation"/> instance.
/// </summary>
/// <param name="owner">
/// Schema element supplied by the caller.
/// NOTE(review): presumably the element that owns the new association; confirm against implementations.
/// </param>
/// <param name="objectToWrap">
/// Object supplied by the caller.
/// NOTE(review): presumably the underlying object the association wraps; confirm against implementations.
/// </param>
/// <returns>The <see cref="SchemaAssociation"/> produced by the implementing class.</returns>
public abstract SchemaAssociation createSchemaAssociation(SchemaElement owner,object objectToWrap);
/// <summary>
/// Creates a <see cref="SchemaElement"/> of <c>DataType.Float</c> named after the Thrift element.
/// </summary>
/// <param name="parent">Parent passed through to the new element.</param>
/// <param name="tse">Thrift schema element whose <c>Name</c> is used.</param>
/// <returns>The newly created float element.</returns>
protected override SchemaElement CreateSimple(SchemaElement parent, Thrift.SchemaElement tse)
{
    var columnName = tse.Name;
    var floatElement = new SchemaElement(columnName, DataType.Float, parent);
    return floatElement;
}
/// <summary>
/// Abstract factory method: the concrete subclass supplies the <see cref="SchemaProperty"/> instance.
/// </summary>
/// <param name="owner">
/// Schema element supplied by the caller.
/// NOTE(review): presumably the element that owns the new property; confirm against implementations.
/// </param>
/// <param name="objectToWrap">
/// Object supplied by the caller.
/// NOTE(review): presumably the underlying object the property wraps; confirm against implementations.
/// </param>
/// <returns>The <see cref="SchemaProperty"/> produced by the implementing class.</returns>
public abstract SchemaProperty createSchemaProperty(SchemaElement owner,object objectToWrap);
/// <summary>
/// Abstract factory method: the concrete subclass supplies the <see cref="SchemaProperty"/> instance.
/// </summary>
/// <param name="owner">
/// Schema element supplied by the caller.
/// NOTE(review): presumably the element that owns the new property; confirm against implementations.
/// </param>
/// <param name="objectToWrap">
/// Object supplied by the caller.
/// NOTE(review): presumably the underlying object the property wraps; confirm against implementations.
/// </param>
/// <returns>The <see cref="SchemaProperty"/> produced by the implementing class.</returns>
public abstract SchemaProperty createSchemaProperty(SchemaElement owner, object objectToWrap);