Exemple #1
0
        /// <summary>
        ///     Builds a schema from its SDF representation.
        /// </summary>
        /// <param name="schema">SDF value describing the schema; it has to be a node named "schema".</param>
        /// <exception cref="InvalidDataException">
        ///     Thrown when <paramref name="schema"/> is not a (schema) node, or when one of its children
        ///     is neither a node-type nor a literal-type definition. Element verification may throw as well.
        /// </exception>
        public Schema(SDF schema)
        {
            if (!(schema is Node root) || !root.Name.Equals("schema"))
            {
                throw new InvalidDataException("Schema must be a (schema) node.");
            }

            // Built-in types; "bool" and "boolean" are two names for one shared instance.
            var booleanType = new SchemaBooleanType();
            _builtinTypes["node"]    = new SchemaSimpleNodeType();
            _builtinTypes["string"]  = new SchemaStringType();
            _builtinTypes["bool"]    = booleanType;
            _builtinTypes["boolean"] = booleanType;
            _builtinTypes["number"]  = new SchemaNumberType();
            _builtinTypes["null"]    = new SchemaNullType();

            _topElement = MakeElement(root.Attributes["top-element"]);

            // Every child of the schema node declares one user-defined type.
            foreach (var definition in root.Children)
            {
                switch (MakeType(definition))
                {
                    case SchemaNodeType nodeType:
                        _types[nodeType.Name] = nodeType;
                        break;

                    case SchemaLiteralType literalType:
                        _types[literalType.Name] = literalType;
                        break;

                    default:
                        throw new InvalidDataException("User-defined type must be either node-type or literal-type.");
                }
            }

            // Built-ins are registered last, so they replace a user type that uses the same name.
            foreach (var builtin in _builtinTypes)
            {
                _types[builtin.Key] = builtin.Value;
            }

            // Check that every element reference points at a declared type of the right kind.
            VerifyElement(_topElement);
            foreach (var entry in _types)
            {
                // Built-ins have nothing to verify; literal types can only reference a built-in.
                if (entry.Value is SchemaBuiltinType || entry.Value is SchemaLiteralType)
                {
                    continue;
                }

                var declared = entry.Value as SchemaNodeType;
                VerifyElement(declared.Children);
                foreach (var attribute in declared.Attributes)
                {
                    VerifyElement(attribute.Element);
                }
            }

            ErrorMessage = null;
        }
Exemple #2
0
        /// <summary>
        ///     Checks whether the matches collected so far are compatible with the given schema element,
        ///     allowing for input that is not complete yet.
        /// </summary>
        /// <param name="schemaElement">Schema element the input is validated against.</param>
        /// <param name="input">Matches gathered so far.</param>
        /// <returns>
        ///     <c>true</c> when the input is acceptable so far; otherwise <c>false</c>. Two failure paths set
        ///     <c>ErrorMessage</c> here directly; the delegated validators are not shown in this snippet.
        /// </returns>
        private bool ValidateMatchesPartial(SchemaElement schemaElement, List <Match> input)
        {
            switch (schemaElement)
            {
                case SchemaNodeElement nodeElement:
                    if (input.Count > 1)
                    {
                        ErrorMessage = "One node expected, multiple found.";
                        return false;
                    }

                    // An empty list is fine: the node may simply not have arrived yet.
                    return input.Count == 0 || ValidateMatchesNodeElementPartial(nodeElement, input[0]);

                case SchemaLiteralElement literalElement:
                    if (input.Count > 1)
                    {
                        ErrorMessage = "One literal expected, multiple found.";
                        return false;
                    }

                    // An empty list is fine; a literal that is present cannot match partially,
                    // so it goes through the full literal validation.
                    return input.Count == 0 || ValidateMatchesLiteralElement(literalElement, input[0]);

                case SchemaListElement listElement:
                    return ValidateMatchesListElementPartial(listElement, input);

                case SchemaSequenceElement sequenceElement:
                    return ValidateMatchesSequenceElementPartial(sequenceElement, input);

                case SchemaOneOfElement oneOfElement:
                    return ValidateMatchesOneOfElementPartial(oneOfElement, input);

                default:
                    ErrorMessage = "Unknown element type in ValidateMatches.";
                    return false;
            }
        }
Exemple #3
0
        /// <summary>
        /// Turns the flat Thrift schema list held in the file metadata into a tree of
        /// <see cref="SchemaElement"/> and wraps the top-level elements into a <see cref="Schema"/>.
        /// </summary>
        /// <param name="formatOptions">Format options forwarded to every created schema element.</param>
        /// <returns>A schema built from the children collected under a temporary root element.</returns>
        public Schema ParseSchema(ParquetOptions formatOptions)
        {
            // Consumes entries of the flat list, starting at 'position', until 'owner' holds
            // 'expectedChildren' children; recurses into every entry that declares children.
            void Build(SchemaElement owner, ref int position, int expectedChildren, bool ownerIsRoot)
            {
                // Elements directly under the artificial root are created without a parent.
                SchemaElement parent = ownerIsRoot ? null : owner;

                while (owner.Children.Count < expectedChildren)
                {
                    Thrift.SchemaElement current = _fileMeta.Schema[position];
                    Thrift.SchemaElement descendInto;
                    SchemaElement        created;

                    if (tseIsList(current))
                    {
                        // A LIST occupies three consecutive entries: the annotated entry itself,
                        // the entry after it, and the element entry.
                        Thrift.SchemaElement listTop = current;

                        position += 1;
                        Thrift.SchemaElement listGroup = _fileMeta.Schema[position];

                        position += 1;
                        Thrift.SchemaElement listItem = _fileMeta.Schema[position];

                        Type listContainer = listItem.Num_children == 0
                            ? typeof(IEnumerable)   //augmented to generic IEnumerable in constructor
                            : typeof(IEnumerable <Row>);

                        created = new SchemaElement(listItem, parent, formatOptions, listContainer, listTop.Name);
                        created.Path       = string.Join(Schema.PathSeparator, listTop.Name, listGroup.Name, listItem.Name);
                        created.IsRepeated = true;
                        if (!ownerIsRoot)
                        {
                            created.Path = owner.Path + Schema.PathSeparator + created.Path;
                        }

                        created.MaxDefinitionLevel = CountRepetitions(Thrift.FieldRepetitionType.OPTIONAL, created, listGroup);
                        created.MaxRepetitionLevel = CountRepetitions(Thrift.FieldRepetitionType.REPEATED, created, listGroup, listTop);

                        // Children, if any, hang off the element entry rather than the list wrapper.
                        descendInto = listItem;
                    }
                    else
                    {
                        Type containerType = null;
                        if (current.Num_children > 0)
                        {
                            containerType = typeof(Row);
                        }

                        created = new SchemaElement(current, parent, formatOptions, containerType);
                        created.MaxDefinitionLevel = CountRepetitions(Thrift.FieldRepetitionType.OPTIONAL, created);
                        created.MaxRepetitionLevel = CountRepetitions(Thrift.FieldRepetitionType.REPEATED, created);

                        descendInto = current;
                    }

                    owner.Children.Add(created);

                    position += 1;

                    if (descendInto.Num_children > 0)
                    {
                        Build(created, ref position, descendInto.Num_children, false);
                    }
                }
            }

            bool tseIsList(Thrift.SchemaElement candidate)
            {
                return candidate.Converted_type == Thrift.ConvertedType.LIST;
            }

            // Entry 0 of the flat list is the root; its children start at index 1.
            var treeRoot = new SchemaElement <int>("root");
            int cursor   = 1;

            Build(treeRoot, ref cursor, _fileMeta.Schema[0].Num_children, true);

            return new Schema(treeRoot.Children);
        }
Exemple #4
0
        // making Schema elements, types and verifying those are OK

        /// <summary>
        ///     Recursively checks that every type referenced from the given element is declared in
        ///     <c>_types</c> and is of a kind that fits the referencing element.
        /// </summary>
        /// <param name="schemaElement">Element to verify; <c>null</c> is accepted and ignored.</param>
        /// <exception cref="InvalidDataException">
        ///     Thrown when a referenced type is not declared, when a literal element references a
        ///     non-literal type, when a node element references a non-node type, or when the element
        ///     is of an unknown kind.
        /// </exception>
        private void VerifyElement(SchemaElement schemaElement)
        {
            switch (schemaElement)
            {
                case null:
                    return;

                case SchemaListElement list:
                    VerifyElement(list.Element);
                    return;

                case SchemaOneOfElement oneOf:
                    foreach (var option in oneOf.Options)
                    {
                        VerifyElement(option);
                    }

                    return;

                case SchemaSequenceElement sequence:
                    foreach (var item in sequence.Sequence)
                    {
                        VerifyElement(item);
                    }

                    return;

                case SchemaLiteralElement literal:
                {
                    // Single lookup instead of ContainsKey followed by the indexer.
                    if (!_types.TryGetValue(literal.TypeName, out var literalType))
                    {
                        throw new InvalidDataException("Literal element references an undeclared type '" + literal.TypeName + "'");
                    }

                    // Accepted: a user-defined literal type, or any built-in except the simple node type.
                    bool isLiteralBuiltin = literalType is SchemaBuiltinType && !(literalType is SchemaSimpleNodeType);
                    if (!isLiteralBuiltin && !(literalType is SchemaLiteralType))
                    {
                        throw new InvalidDataException("Literal element references non-literal type.");
                    }

                    return;
                }

                case SchemaNodeElement node:
                {
                    if (!_types.TryGetValue(node.TypeName, out var nodeType))
                    {
                        throw new InvalidDataException("Node element references an undeclared type '" + node.TypeName + "'");
                    }

                    if (!(nodeType is SchemaSimpleNodeType) && !(nodeType is SchemaNodeType))
                    {
                        throw new InvalidDataException("Node element references non-node type.");
                    }

                    return;
                }

                default:
                    throw new InvalidDataException("Unknown element type found in VerifyElement.");
            }
        }
 /// <summary>
 /// Abstract factory method that returns a <c>SchemaLiteral</c>; the implementation is supplied
 /// by derived classes that are not part of this snippet.
 /// </summary>
 /// <param name="owner">Schema element handed to the implementation; presumably the element that owns the literal (TODO confirm).</param>
 /// <param name="objectToWrap">Object handed to the implementation; presumably the underlying object the literal wraps (TODO confirm).</param>
 /// <returns>The <c>SchemaLiteral</c> produced by the implementation.</returns>
 public abstract SchemaLiteral createSchemaLiteral(SchemaElement owner,object objectToWrap);
Exemple #6
0
 /// <summary>
 /// Abstract factory method that returns a <c>SchemaLiteral</c>; the implementation is supplied
 /// by derived classes that are not part of this snippet.
 /// </summary>
 /// <param name="owner">Schema element handed to the implementation; presumably the element that owns the literal (TODO confirm).</param>
 /// <param name="objectToWrap">Object handed to the implementation; presumably the underlying object the literal wraps (TODO confirm).</param>
 /// <returns>The <c>SchemaLiteral</c> produced by the implementation.</returns>
 public abstract SchemaLiteral createSchemaLiteral(SchemaElement owner, object objectToWrap);
Exemple #7
0
        /// <summary>
        /// Reads the file into a <see cref="DataSet"/>: validates the reader options, loads the file
        /// metadata, parses the schema, reads the column chunks of every row group that is not skipped
        /// and merges the collected values.
        /// </summary>
        /// <returns>The merged data set, with total row count and creator copied from the file metadata.</returns>
        /// <exception cref="NotSupportedException">Thrown when the parsed schema reports nested elements.</exception>
        /// <exception cref="ParquetException">Thrown when reading or collecting a column chunk fails; the original error is the inner exception.</exception>
        public DataSet Read()
        {
            _readerOptions.Validate();

            _meta = ReadMetadata();

            Schema schema = new FileMetadataParser(_meta).ParseSchema(_formatOptions);

            if (schema.HasNestedElements)
            {
                throw new NotSupportedException("nested structures are not yet supported");
            }

            var  valuesByPath = new Dictionary <string, IList>();
            long groupStart   = 0;   // index of the first row of the current row group
            long totalRead    = 0;   // rows collected so far, counted on the first column only

            foreach (Thrift.RowGroup rowGroup in _meta.Row_groups)
            {
                // Skip the whole group when the requested count is already satisfied,
                // or when the group ends before the requested offset.
                bool limitReached = _readerOptions.Count != -1 && totalRead >= _readerOptions.Count;
                if (limitReached || _readerOptions.Offset > groupStart + rowGroup.Num_rows - 1)
                {
                    groupStart += rowGroup.Num_rows;
                    continue;
                }

                long offset = Math.Max(0, _readerOptions.Offset - groupStart);

                long count;
                if (_readerOptions.Count == -1)
                {
                    count = rowGroup.Num_rows;
                }
                else
                {
                    count = Math.Min(_readerOptions.Count - totalRead, rowGroup.Num_rows);
                }

                for (int columnIndex = 0; columnIndex < rowGroup.Columns.Count; columnIndex++)
                {
                    Thrift.ColumnChunk chunk   = rowGroup.Columns[columnIndex];
                    SchemaElement      element = schema[chunk];

                    var columnReader = new PColumn(chunk, element, _input, ThriftStream, _formatOptions);

                    try
                    {
                        IList chunkValues = columnReader.Read(offset, count);

                        if (valuesByPath.TryGetValue(element.Path, out IList collected))
                        {
                            collected.AddRange(chunkValues);
                        }
                        else
                        {
                            valuesByPath[element.Path] = chunkValues;
                        }

                        if (columnIndex == 0)
                        {
                            //todo: this may not work
                            totalRead += chunkValues.Count;
                        }
                    }
                    catch (Exception ex)
                    {
                        throw new ParquetException($"fatal error reading column '{element}'", ex);
                    }
                }

                groupStart += rowGroup.Num_rows;
            }

            DataSet result = new RecursiveMerge(schema).Merge(valuesByPath);

            result.TotalRowCount      = _meta.Num_rows;
            result.Metadata.CreatedBy = _meta.Created_by;

            return result;
        }
 /// <summary>
 /// Test helper: asserts that the element carries the expected name and the sample schema namespace.
 /// </summary>
 /// <param name="element">Element under test.</param>
 /// <param name="name">Name the element is expected to have.</param>
 private static void VerifyElement(SchemaElement element, String name)
 {
     const string expectedNamespace = "samples:schemas:simpleSchema";

     // The name is checked first, so a wrong name is reported before a wrong namespace.
     Assert.That(element.Name, Is.EqualTo(name));
     Assert.That(element.Namespace, Is.EqualTo(expectedNamespace));
 }
Exemple #9
0
        /// <summary>
        /// Turns the flat Thrift schema list held in <c>_fileMeta.Schema</c> into a tree of
        /// <see cref="SchemaElement"/> and returns a <see cref="Schema"/> built from the
        /// elements collected under a temporary root.
        /// </summary>
        /// <param name="formatOptions">Format options forwarded to the element builders.</param>
        /// <returns>A schema created from the temporary root's children.</returns>
        public Schema ParseSchema(ParquetOptions formatOptions)
        {
            // Consumes entries of the flat list starting at index i until 'node' holds 'count'
            // children, recursing into every entry that declares children of its own.
            void Build(SchemaElement node, ref int i, int count, bool isRoot)
            {
                while (node.Children.Count < count)
                {
                    Thrift.SchemaElement tse = _fileMeta.Schema[i];
                    SchemaElement        mse;

                    // The list and map builders take tse and i by reference: they advance i past the
                    // extra entries they consume (and BuildListSchema/BuildMapSchema, where visible,
                    // repoint tse at the last entry they read).
                    if (tse.Converted_type == Thrift.ConvertedType.LIST)
                    {
                        mse = BuildListSchema(ref tse, ref i, isRoot, node, formatOptions);
                    }
                    else if (tse.Converted_type == Thrift.ConvertedType.MAP || tse.Converted_type == Thrift.ConvertedType.MAP_KEY_VALUE)
                    {
                        mse = BuildMapSchema(ref tse, ref i, isRoot, node, formatOptions);
                    }
                    else
                    {
                        // Entries with children get a Row container type; entries without get none.
                        Type containerType = tse.Num_children > 0
                     ? typeof(Row)
                     : null;

                        // Elements directly under the temporary root are created without a parent.
                        SchemaElement parent = isRoot ? null : node;
                        mse = BuildSchemaElement(tse, parent, formatOptions, containerType);

                        AddFlags(mse, tse);
                    }

                    // The levels are captured before the element is added and written back afterwards.
                    // NOTE(review): presumably Children.Add can alter them - confirm against the
                    // collection / AutoUpdateLevels implementation before reordering these lines.
                    int mrl = mse.MaxRepetitionLevel;
                    int mdl = mse.MaxDefinitionLevel;
                    node.Children.Add(mse);
                    mse.MaxRepetitionLevel = mrl;
                    mse.MaxDefinitionLevel = mdl;

                    i += 1;

                    if (tse.Num_children > 0)
                    {
                        Build(mse, ref i, tse.Num_children, false);
                    }
                }
            }

            //extract schema tree
            var root = new SchemaElement <int>("root")
            {
                Path = string.Empty
            };

            // AutoUpdateLevels is switched off for the duration of the build and back on afterwards.
            root.AutoUpdateLevels = false;
            // Entry 0 supplies the number of top-level children; the walk starts at index 1.
            int start = 1;

            Build(root, ref start, _fileMeta.Schema[0].Num_children, true);

            // Detach is called on every top-level element before the schema is created.
            // NOTE(review): presumably this unlinks them from the temporary root - confirm.
            foreach (SchemaElement se in root.Children)
            {
                se.Detach();
            }
            root.AutoUpdateLevels = true;

            return(new Schema(root.Children));
        }
Exemple #10
0
        /// <summary>
        /// Builds the schema element for a LIST entry. Reads the two entries following
        /// <paramref name="tse"/> from the flat Thrift schema list and creates a single repeated
        /// element from the three of them.
        /// </summary>
        /// <param name="tse">On entry, the LIST-annotated entry; on exit, the element entry (the second entry read).</param>
        /// <param name="i">Index of <paramref name="tse"/> in the flat list; advanced by two.</param>
        /// <param name="isRoot">When true the element is created without a parent and its path is not prefixed.</param>
        /// <param name="node">Element the new element is being built under.</param>
        /// <param name="formatOptions">Format options forwarded to the element builder.</param>
        /// <returns>The newly built, repeated schema element.</returns>
        private SchemaElement BuildListSchema(ref Thrift.SchemaElement tse, ref int i, bool isRoot, SchemaElement node, ParquetOptions formatOptions)
        {
            Thrift.SchemaElement listTop = tse;

            i += 1;
            Thrift.SchemaElement listGroup = _fileMeta.Schema[i];

            i += 1;
            Thrift.SchemaElement listItem = _fileMeta.Schema[i];

            SchemaElement parent = isRoot ? null : node;

            Type itemContainer;
            if (listItem.Num_children == 0)
            {
                itemContainer = typeof(IEnumerable);   //augmented to generic IEnumerable in constructor
            }
            else
            {
                itemContainer = typeof(IEnumerable <Row>);
            }

            SchemaElement listSchema = BuildSchemaElement(listItem, parent, formatOptions, itemContainer, listTop.Name);

            // The path is made of all three names and, below the root, prefixed with the owner's path.
            listSchema.Path       = string.Join(Schema.PathSeparator, listTop.Name, listGroup.Name, listItem.Name);
            listSchema.IsRepeated = true;
            if (!isRoot)
            {
                listSchema.Path = node.Path + Schema.PathSeparator + listSchema.Path;
            }

            AddFlags(listSchema, listTop, listGroup, listItem);

            // Hand the element entry back so the caller inspects its children.
            tse = listItem;

            return listSchema;
        }
Exemple #11
0
        /// <summary>
        /// Builds the schema element for a MAP entry. Reads the three entries following
        /// <paramref name="tse"/> (container, key, value) from the flat Thrift schema list and creates
        /// one map element typed as a dictionary, with the key and value elements attached as extras.
        /// </summary>
        /// <param name="tse">On entry, the MAP-annotated entry; on exit, the value entry (the last entry read).</param>
        /// <param name="i">Index of <paramref name="tse"/> in the flat list; advanced by three.</param>
        /// <param name="isRoot">When true the element's path is not prefixed with the owner's path.</param>
        /// <param name="node">Element the map is being built under; always assigned as the map's parent.</param>
        /// <param name="formatOptions">Format options used to resolve the key and value system types.</param>
        /// <returns>The newly built map schema element.</returns>
        private SchemaElement BuildMapSchema(ref Thrift.SchemaElement tse, ref int i, bool isRoot, SchemaElement node, ParquetOptions formatOptions)
        {
            //tse is followed by map container (REPEATED) and another two elements - key and value

            Thrift.SchemaElement tseContainer = _fileMeta.Schema[++i];
            Thrift.SchemaElement tseKey       = _fileMeta.Schema[++i];
            Thrift.SchemaElement tseValue     = _fileMeta.Schema[++i];

            Type keyType    = TypePrimitive.GetSystemTypeBySchema(tseKey, formatOptions);
            Type valueType  = TypePrimitive.GetSystemTypeBySchema(tseValue, formatOptions);
            Type gt         = typeof(Dictionary <,>);
            Type masterType = gt.MakeGenericType(keyType, valueType);

            //master schema
            var se = new SchemaElement(tseContainer, tse.Name, masterType, masterType,
                                       string.Join(Schema.PathSeparator, tse.Name, tseContainer.Name));

            if (!isRoot)
            {
                // Prefix with the owner's path, as BuildListSchema does. The previous code concatenated
                // node.Parent (an element object, implicitly converted to text) instead of node.Path.
                se.Path = node.Path + Schema.PathSeparator + se.Path;
            }
            se.Parent = node;
            se.IsMap  = true;
            AddFlags(se, tse, tseContainer);

            //extra schemas: key and value elements, both parented to the map element
            var kse = new SchemaElement(tseKey, null, keyType, keyType, null)
            {
                Parent = se
            };
            var vse = new SchemaElement(tseValue, null, valueType, valueType, null)
            {
                Parent = se
            };

            se.Extra.Add(kse);
            se.Extra.Add(vse);
            AddFlags(kse, tseKey);
            AddFlags(vse, tseValue);

            // Hand the value entry back to the caller, which inspects its children.
            tse = tseValue;
            return(se);
        }
Exemple #12
0
        /// <summary>
        /// Encodes the values of one column into a data page (preceded by a dictionary page when
        /// dictionary encoding is enabled and the dictionary writer accepts the values), compresses
        /// and writes the pages, and reports what was written.
        /// </summary>
        /// <param name="schema">Schema element of the column being written.</param>
        /// <param name="values">Column values; flattened first when the column is repeated.</param>
        /// <param name="ph">Header of the data page; its encoding is switched to PLAIN_DICTIONARY when a dictionary page is produced.</param>
        /// <param name="compression">Compression method applied to each page.</param>
        /// <returns>One tag per written page: the dictionary page first (if any), then the data page.</returns>
        private List <PageTag> WriteValues(SchemaElement schema, IList values, Thrift.PageHeader ph, CompressionMethod compression)
        {
            byte[] dataBytes;
            byte[] dictionaryBytes      = null;
            int    dictionaryValueCount = 0;

            //flatten values if the field is repeatable
            if (values != null && schema.IsRepeated)
            {
                values = FlattenRepeatables(values, schema);
            }

            using (var pageStream = new MemoryStream())
            using (var pageWriter = new BinaryWriter(pageStream))
            {
                // Repetition levels come first, and only for repeated columns.
                if (schema.IsRepeated)
                {
                    List <int> repetitionLevels = CreateRepetitions(values, schema);
                    _rleWriter.Write(pageWriter, _definitionsSchema, repetitionLevels, out IList unusedRepetitionExtra);
                }

                // Definition levels follow; CreateDefinitions also hands back a replacement value list.
                if (schema.IsNullable || schema.IsRepeated)
                {
                    CreateDefinitions(values, schema, out IList definedValues, out List <int> definitionLevels);
                    values = definedValues;

                    _rleWriter.Write(pageWriter, _definitionsSchema, definitionLevels, out IList unusedDefinitionExtra);
                }

                // Data: try the dictionary writer when enabled, otherwise (or on refusal) write plain.
                if (_writerOptions.UseDictionaryEncoding && _dicWriter.Write(pageWriter, schema, values, out IList dictionaryValues))
                {
                    dictionaryValueCount         = dictionaryValues.Count;
                    ph.Data_page_header.Encoding = Thrift.Encoding.PLAIN_DICTIONARY;

                    // The dictionary itself is plain-encoded into its own buffer.
                    using (var dictionaryStream = new MemoryStream())
                    {
                        using (var dictionaryWriter = new BinaryWriter(dictionaryStream))
                        {
                            _plainWriter.Write(dictionaryWriter, schema, dictionaryValues, out IList unusedDictionaryExtra);
                            dictionaryBytes = dictionaryStream.ToArray();
                        }
                    }
                }
                else
                {
                    _plainWriter.Write(pageWriter, schema, values, out IList unusedPlainExtra);
                }

                dataBytes = pageStream.ToArray();
            }

            var pages = new List <PageTag>();

            if (dictionaryBytes != null)
            {
                Thrift.PageHeader dictionaryHeader = _meta.CreateDictionaryPage(dictionaryValueCount);
                dictionaryBytes = Compress(dictionaryHeader, dictionaryBytes, compression);
                int dictionaryHeaderSize = Write(dictionaryHeader, dictionaryBytes);

                pages.Add(new PageTag
                {
                    HeaderSize = dictionaryHeaderSize,
                    HeaderMeta = dictionaryHeader
                });
            }

            dataBytes = Compress(ph, dataBytes, compression);
            int dataHeaderSize = Write(ph, dataBytes);

            pages.Add(new PageTag
            {
                HeaderSize = dataHeaderSize,
                HeaderMeta = ph
            });

            return pages;
        }
Exemple #13
0
 /// <summary>
 /// Abstract factory method that returns a <c>SchemaAssociation</c>; the implementation is supplied
 /// by derived classes that are not part of this snippet.
 /// </summary>
 /// <param name="owner">Schema element handed to the implementation; presumably the element that owns the association (TODO confirm).</param>
 /// <param name="objectToWrap">Object handed to the implementation; presumably the underlying object the association wraps (TODO confirm).</param>
 /// <returns>The <c>SchemaAssociation</c> produced by the implementation.</returns>
 public abstract SchemaAssociation createSchemaAssociation(SchemaElement owner, object objectToWrap);
Exemple #14
0
 /// <summary>
 /// Creates a merger and stores the supplied schema element, format options and value list
 /// for later use; nothing else happens at construction time.
 /// </summary>
 /// <param name="schema">Schema element kept in <c>_schema</c>.</param>
 /// <param name="formatOptions">Format options kept in <c>_formatOptions</c>.</param>
 /// <param name="values">Value list kept in <c>_values</c>.</param>
 public ValueMerger(SchemaElement schema, ParquetOptions formatOptions, IList values)
 {
     this._values        = values;
     this._formatOptions = formatOptions;
     this._schema        = schema;
 }
Exemple #15
0
 /// <summary>
 /// Plain-encoding override that produces no output: it ignores both arguments and always returns null.
 /// </summary>
 /// <param name="tse">Schema element; not used.</param>
 /// <param name="x">Not used.</param>
 /// <returns>Always <c>null</c>.</returns>
 public override byte[] PlainEncode(SchemaElement tse, bool x)
 {
     return null;
 }
Exemple #16
0
        // ------------------------------------------
        // ACCESSORS
        // ------------------------------------------

        #region Accessors

        /// <summary>
        /// Looks up the schema element with the given ID by delegating to the root zone.
        /// </summary>
        /// <param name="id">ID of the element to find.</param>
        /// <param name="parentMetobject1">Optional parent element. It is not used by the current implementation: the lookup always goes through the root zone.</param>
        /// <returns>Whatever the root zone returns for <paramref name="id"/>, or <c>null</c> when there is no root zone.</returns>
        public SchemaElement GetElementWithId(String id, SchemaElement parentMetobject1 = null)
            => RootZone?.GetElementWithId(id);
Exemple #17
0
 /// <summary>
 /// Plain-decoding override that produces no value: it ignores both arguments and always returns null.
 /// </summary>
 /// <param name="tse">Schema element; not used.</param>
 /// <param name="encoded">Encoded bytes; not used.</param>
 /// <returns>Always <c>null</c>.</returns>
 public override object PlainDecode(SchemaElement tse, byte[] encoded)
 {
     return null;
 }
 /// <summary>
 /// Abstract factory method that returns a <c>SchemaAssociation</c>; the implementation is supplied
 /// by derived classes that are not part of this snippet.
 /// </summary>
 /// <param name="owner">Schema element handed to the implementation; presumably the element that owns the association (TODO confirm).</param>
 /// <param name="objectToWrap">Object handed to the implementation; presumably the underlying object the association wraps (TODO confirm).</param>
 /// <returns>The <c>SchemaAssociation</c> produced by the implementation.</returns>
 public abstract SchemaAssociation createSchemaAssociation(SchemaElement owner,object objectToWrap);
Exemple #19
0
 /// <summary>
 /// Creates a simple schema element of data type <c>DataType.Float</c>, named after the Thrift element.
 /// </summary>
 /// <param name="parent">Parent passed on to the new element.</param>
 /// <param name="tse">Thrift schema element whose name is used.</param>
 /// <returns>The new float schema element.</returns>
 protected override SchemaElement CreateSimple(SchemaElement parent, Thrift.SchemaElement tse)
     => new SchemaElement(tse.Name, DataType.Float, parent);
 /// <summary>
 /// Abstract factory method that returns a <c>SchemaProperty</c>; the implementation is supplied
 /// by derived classes that are not part of this snippet.
 /// </summary>
 /// <param name="owner">Schema element handed to the implementation; presumably the element that owns the property (TODO confirm).</param>
 /// <param name="objectToWrap">Object handed to the implementation; presumably the underlying object the property wraps (TODO confirm).</param>
 /// <returns>The <c>SchemaProperty</c> produced by the implementation.</returns>
 public abstract SchemaProperty createSchemaProperty(SchemaElement owner,object objectToWrap);
Exemple #21
0
 /// <summary>
 /// Abstract factory method that returns a <c>SchemaProperty</c>; the implementation is supplied
 /// by derived classes that are not part of this snippet.
 /// </summary>
 /// <param name="owner">Schema element handed to the implementation; presumably the element that owns the property (TODO confirm).</param>
 /// <param name="objectToWrap">Object handed to the implementation; presumably the underlying object the property wraps (TODO confirm).</param>
 /// <returns>The <c>SchemaProperty</c> produced by the implementation.</returns>
 public abstract SchemaProperty createSchemaProperty(SchemaElement owner, object objectToWrap);