예제 #1
0
            /// <summary>
            /// Builds the schema that <see cref="GroupTransform"/> emits for a given input schema:
            /// the group-key columns come first, followed by one vector-typed column per kept column.
            /// </summary>
            /// <param name="sourceSchema">Schema of the data being grouped.</param>
            /// <returns>The output schema corresponding to <paramref name="sourceSchema"/>.</returns>
            private Schema BuildOutputSchema(Schema sourceSchema)
            {
                var outputBuilder = new SchemaBuilder();

                // Group-key columns are copied as they are: same name, same type, same metadata.
                foreach (var keyName in _groupColumns)
                {
                    var keyColumn = sourceSchema[keyName];
                    outputBuilder.AddColumn(keyName, keyColumn.Type, keyColumn.Metadata);
                }

                // Kept columns are aggregated, so each one becomes a vector of its original primitive type.
                foreach (var keepName in _keepColumns)
                {
                    var keepColumn = sourceSchema[keepName];

                    // Only the IsNormalized and KeyValues metadata kinds are carried over to the aggregated column.
                    var keptMetadata = new MetadataBuilder();
                    keptMetadata.Add(keepColumn.Metadata,
                                     kind => kind == MetadataUtils.Kinds.IsNormalized || kind == MetadataUtils.Kinds.KeyValues);

                    // A non-primitive source type yields null here and is rejected by the check below.
                    var aggregatedValueType = keepColumn.Type as PrimitiveType;
                    _ectx.CheckValue(aggregatedValueType, nameof(aggregatedValueType), "Columns being aggregated must be primitive types such as string, float, or integer");

                    outputBuilder.AddColumn(keepName, new VectorType(aggregatedValueType), keptMetadata.GetMetadata());
                }

                return outputBuilder.GetSchema();
            }
예제 #2
0
            /// <summary>
            /// Binds the ungroup operation to <paramref name="inputSchema"/> and computes the output schema.
            /// Non-pivot columns are copied unchanged; pivot columns are emitted with their item type and
            /// only the metadata accepted by <c>ShouldPreserveMetadata</c>.
            /// </summary>
            /// <param name="ectx">Exception context used for assertions; may be null.</param>
            /// <param name="inputSchema">Schema of the incoming data.</param>
            /// <param name="mode">Ungroup mode stored in <c>Mode</c>.</param>
            /// <param name="pivotColumns">Names of the columns to ungroup; must be non-empty.</param>
            public UngroupBinding(IExceptionContext ectx, Schema inputSchema, UngroupMode mode, string[] pivotColumns)
            {
                Contracts.AssertValueOrNull(ectx);
                _ectx = ectx;
                _ectx.AssertValue(inputSchema);
                _ectx.AssertNonEmpty(pivotColumns);

                // Storing the input schema is also what makes InputColumnCount valid below.
                _inputSchema = inputSchema;
                Mode = mode;

                Bind(_ectx, inputSchema, pivotColumns, out _infos);

                // Map every input column index to its position in _infos; -1 marks a non-pivot column.
                _pivotIndex = Utils.CreateArray(InputColumnCount, -1);
                for (int pivotSlot = 0; pivotSlot < _infos.Length; pivotSlot++)
                {
                    var inputIndex = _infos[pivotSlot].Index;
                    _ectx.Assert(_pivotIndex[inputIndex] == -1);
                    _pivotIndex[inputIndex] = pivotSlot;
                }

                // Column order and indices of the input are preserved in the output.
                var outputBuilder = new SchemaBuilder();
                for (int col = 0; col < InputColumnCount; ++col)
                {
                    if (_pivotIndex[col] >= 0)
                    {
                        // Pivot column. Consider the row
                        //   Age UserID
                        //   18  {"Amy", "Willy"}
                        // With "UserID" as the only pivot column, ungrouping may produce
                        //   Age UserID
                        //   18  "Amy"
                        //   18  "Willy"
                        // so the output column has the element type of the input column.
                        var preservedMetadata = new MetadataBuilder();
                        preservedMetadata.Add(inputSchema[col].Metadata, kind => ShouldPreserveMetadata(kind));
                        outputBuilder.AddColumn(inputSchema[col].Name, inputSchema[col].Type.GetItemType(), preservedMetadata.GetMetadata());
                    }
                    else
                    {
                        // Not a pivot column: plain copy.
                        outputBuilder.AddColumn(inputSchema[col].Name, inputSchema[col].Type, inputSchema[col].Metadata);
                    }
                }

                OutputSchema = outputBuilder.GetSchema();
            }
예제 #3
0
        /// <summary>
        /// Creates a <see cref="Schema"/> with one column per entry of <paramref name="shape"/>.
        /// Each column's type comes from <c>MakeColumnType</c>, and each metadata entry of the shape
        /// column is attached under the same name with a getter obtained from
        /// <c>GetDefaultGetter</c> or <c>GetDefaultVectorGetter</c>.
        /// </summary>
        /// <param name="shape">The schema shape to materialize.</param>
        /// <returns>A schema with the same column names, in the same order, as <paramref name="shape"/>.</returns>
        public static Schema Create(SchemaShape shape)
        {
            var builder = new SchemaBuilder();

            for (int i = 0; i < shape.Count; ++i)
            {
                var metaBuilder = new MetadataBuilder();
                var partialMetadata = shape[i].Metadata;
                for (int j = 0; j < partialMetadata.Count; ++j)
                {
                    // Index the metadata collection with j (the metadata index), not i (the column index).
                    var metaColumnType = MakeColumnType(partialMetadata[j]);

                    // The getter is generic over the raw value type, so it is created via MarshalInvoke
                    // with the runtime type (item type for vectors, the type itself otherwise).
                    Delegate del;
                    if (metaColumnType.IsVector)
                    {
                        del = Utils.MarshalInvoke(GetDefaultVectorGetter<int>, metaColumnType.ItemType.RawType);
                    }
                    else
                    {
                        del = Utils.MarshalInvoke(GetDefaultGetter<int>, metaColumnType.RawType);
                    }
                    metaBuilder.Add(partialMetadata[j].Name, metaColumnType, del);
                }

                // Attach the metadata that was just built; otherwise it is silently discarded.
                builder.AddColumn(shape[i].Name, MakeColumnType(shape[i]), metaBuilder.GetMetadata());
            }

            return builder.GetSchema();
        }
            /// <summary>
            /// Creates a data view over the columns collected by <paramref name="builder"/>.
            /// Slot-name and key-value getters registered on the builder for a column name are attached
            /// to that column as metadata.
            /// </summary>
            /// <param name="env">Host environment; used to register the "ArrayDataView" host.</param>
            /// <param name="builder">Builder holding the column names, columns and metadata getters.</param>
            /// <param name="rowCount">Number of rows; asserted to be non-negative.</param>
            public DataView(IHostEnvironment env, ArrayDataViewBuilder builder, int rowCount)
            {
                Contracts.AssertValue(env, nameof(env));
                _host = env.Register("ArrayDataView");

                _host.AssertValue(builder);
                _host.Assert(rowCount >= 0);
                _host.Assert(builder._names.Count == builder._columns.Count);

                _columns = builder._columns.ToArray();
                _rowCount = rowCount;

                var viewSchema = new SchemaBuilder();
                for (int col = 0; col < _columns.Length; col++)
                {
                    var columnName = builder._names[col];
                    var columnMetadata = new MetadataBuilder();

                    // Optional slot names, looked up by column name.
                    if (builder._getSlotNames.TryGetValue(columnName, out var slotNamesGetter))
                    {
                        columnMetadata.AddSlotNames(_columns[col].Type.VectorSize, slotNamesGetter);
                    }

                    // Optional key values (always text), looked up by column name.
                    if (builder._getKeyValues.TryGetValue(columnName, out var keyValueGetter))
                    {
                        columnMetadata.AddKeyValues(_columns[col].Type.KeyCount, TextType.Instance, keyValueGetter);
                    }

                    viewSchema.AddColumn(columnName, _columns[col].Type, columnMetadata.GetMetadata());
                }

                _schema = viewSchema.GetSchema();
            }
예제 #5
0
                /// <summary>
                /// Append label names to score column as its metadata.
                /// </summary>
                private Schema DecorateOutputSchema(Schema partialSchema, int scoreColumnIndex, VectorType labelNameType,
                                                    ValueGetter<VBuffer<T>> labelNameGetter, string labelNameKind)
                {
                    // Metadata is read-only, so a new schema is built instead of mutating the score column in place.
                    // Columns are added in their original order so that indices match the partial schema.
                    var decorated = new SchemaBuilder();

                    for (int col = 0; col < partialSchema.ColumnCount; ++col)
                    {
                        var column = partialSchema[col];
                        var columnMetadata = new MetadataBuilder();

                        if (col != scoreColumnIndex)
                        {
                            // Only the score column is affected; everything else keeps all of its metadata.
                            columnMetadata.Add(column.Metadata, selector: kind => true);
                        }
                        else
                        {
                            // Score column: keep every existing kind except the label-name kind,
                            // then add the label names under that kind.
                            columnMetadata.Add(column.Metadata, selector: kind => kind != labelNameKind);
                            columnMetadata.Add(labelNameKind, labelNameType, labelNameGetter);
                        }

                        decorated.AddColumn(column.Name, column.Type, columnMetadata.GetMetadata());
                    }

                    return decorated.GetSchema();
                }
예제 #6
0
            /// <summary>
            /// Creates a row whose schema has a single column named "Foo" of the given type and no metadata.
            /// </summary>
            /// <param name="type">Type of the single column.</param>
            /// <param name="getter">Getter delegate stored for later use.</param>
            private Row(ColumnType type, Delegate getter)
            {
                _getter = getter;

                var singleColumnSchema = new SchemaBuilder();
                singleColumnSchema.AddColumn("Foo", type, null);
                Schema = singleColumnSchema.GetSchema();
            }
예제 #7
0
        // Round-trips a three-column schema through SchemaShape and FakeSchemaFactory and checks
        // the names, types and metadata of the resulting columns.
        void SimpleTest()
        {
            // Metadata "M" of type R4 whose getter always yields 484.
            var metadataBuilder = new MetadataBuilder();

            metadataBuilder.Add("M", NumberType.R4, (ref float v) => v = 484f);
            var schemaBuilder = new SchemaBuilder();

            // A: vector of R4 with 94 slots, B: uint key with 17 values, C: I4 scalar carrying metadata "M".
            schemaBuilder.AddColumn("A", new VectorType(NumberType.R4, 94));
            schemaBuilder.AddColumn("B", new KeyType(typeof(uint), 17));
            schemaBuilder.AddColumn("C", NumberType.I4, metadataBuilder.GetMetadata());

            var shape = SchemaShape.Create(schemaBuilder.GetSchema());

            var fakeSchema = FakeSchemaFactory.Create(shape);

            var columnA = fakeSchema[0];
            var columnB = fakeSchema[1];
            var columnC = fakeSchema[2];

            // NOTE(review): the expected counts below (10) differ from the sizes used above (94 and 17);
            // presumably FakeSchemaFactory substitutes a fixed size because the shape does not carry
            // exact sizes - confirm against FakeSchemaFactory.
            Assert.Equal("A", columnA.Name);
            Assert.Equal(NumberType.R4, columnA.Type.GetItemType());
            Assert.Equal(10, columnA.Type.GetValueCount());

            Assert.Equal("B", columnB.Name);
            Assert.Equal(DataKind.U4, columnB.Type.GetRawKind());
            Assert.Equal(10u, columnB.Type.GetKeyCount());

            Assert.Equal("C", columnC.Name);
            Assert.Equal(NumberType.I4, columnC.Type);

            var metaC = columnC.Metadata;

            // Column C must carry exactly one metadata entry.
            Assert.Single(metaC.Schema);

            // Start from a sentinel so the assertion proves the getter actually wrote a value.
            float mValue = -1;

            // The fake schema is expected to return the default value, not the original 484.
            metaC.GetValue("M", ref mValue);
            Assert.Equal(default, mValue);
예제 #8
0
        public void Can_AddColumn()
        {
            // Smoke test: adding a single named column must complete without throwing.
            const string columnName = "MyUserId";

            _schemaBuilder.AddColumn(columnName);

            // NOTE: the assertions on the resulting column list are currently disabled.
            //Assert.IsTrue(_schemaBuilder.Columns.Count == 1);
            //Assert.AreEqual(columnName, _schemaBuilder.Columns[0].Name);
        }
예제 #9
0
            /// <summary>
            /// Creates a single-column schema for <paramref name="collection"/>: an R4 vector column named
            /// after the feature role, with one slot per entry and slot names served by <c>GetSlotNames</c>.
            /// </summary>
            /// <param name="collection">The feature name collection; must not be null.</param>
            public FeatureNameCollectionBinding(FeatureNameCollection collection)
            {
                Contracts.CheckValue(collection, nameof(collection));
                _collection = collection;

                // Both the feature vector and its slot names have one entry per collection item.
                int featureCount = collection.Count;
                _colType = new VectorType(NumberType.R4, featureCount);
                _slotNamesType = new VectorType(TextType.Instance, featureCount);

                // Slot names are produced lazily by delegating to GetSlotNames for column 0.
                var slotNamesMetadata = new MetadataBuilder();
                slotNamesMetadata.Add(
                    MetadataUtils.Kinds.SlotNames,
                    _slotNamesType,
                    (ref VBuffer<ReadOnlyMemory<char>> names) => GetSlotNames(0, ref names));

                var featureSchema = new SchemaBuilder();
                featureSchema.AddColumn(RoleMappedSchema.ColumnRole.Feature.Value, _colType, slotNamesMetadata.GetMetadata());
                FeatureNameCollectionSchema = featureSchema.GetSchema();
            }
예제 #10
0
        /// <summary>
        /// Converts <paramref name="inputBindings"/> into a <see cref="Schema"/>, copying each column's
        /// name, type and every metadata kind reported by <c>GetMetadataTypes</c>.
        /// </summary>
        /// <param name="inputBindings">The column bindings to convert; must not be null.</param>
        /// <returns>A schema with the same columns, in the same order, as the bindings.</returns>
        private static Schema CreateSchema(ColumnBindingsBase inputBindings)
        {
            Contracts.CheckValue(inputBindings, nameof(inputBindings));

            var schemaBuilder = new SchemaBuilder();
            int columnCount = inputBindings.ColumnCount;

            for (int col = 0; col < columnCount; col++)
            {
                var columnMetadata = new MetadataBuilder();

                foreach (var kindAndType in inputBindings.GetMetadataTypes(col))
                {
                    // The getter is generic over the metadata's raw type, so it is created through
                    // MarshalInvoke with the runtime type.
                    var metadataGetter = Utils.MarshalInvoke(
                        GetMetadataGetterDelegate<int>, kindAndType.Value.RawType, inputBindings, col, kindAndType.Key);
                    columnMetadata.Add(kindAndType.Key, kindAndType.Value, metadataGetter);
                }

                schemaBuilder.AddColumn(
                    inputBindings.GetColumnName(col),
                    inputBindings.GetColumnType(col),
                    columnMetadata.GetMetadata());
            }

            return schemaBuilder.GetSchema();
        }
예제 #11
0
            /// <summary>
            /// After <see cref="_sourceSchema"/> and <see cref="_sources"/> are set, picks the selected columns from
            /// <see cref="_sourceSchema"/> to create the output schema. The i-th output column is the input column
            /// whose index is <c>_sources[i]</c>.
            /// </summary>
            /// <returns>A schema containing the selected columns, in the order given by <see cref="_sources"/>.</returns>
            private Schema ComputeOutputSchema()
            {
                var schemaBuilder = new SchemaBuilder();

                for (int i = 0; i < _sources.Length; ++i)
                {
                    // Index into the input schema of the column that becomes the i-th output column.
                    var selectedIndex = _sources[i];

                    // The dropped/kept columns come from user-specified arguments, so a bad configuration must
                    // throw. Both bounds are checked so a negative index gets the same descriptive message, and
                    // the message is only formatted on the failure path.
                    bool indexInRange = 0 <= selectedIndex && selectedIndex < _sourceSchema.Count;
                    if (!indexInRange)
                    {
                        string fmt = string.Format("Column index {0} invalid for input with {1} columns", selectedIndex, _sourceSchema.Count);
                        Contracts.Check(indexInRange, fmt);
                    }

                    // Copy the selected column into the output schema.
                    var selectedColumn = _sourceSchema[selectedIndex];
                    schemaBuilder.AddColumn(selectedColumn.Name, selectedColumn.Type, selectedColumn.Metadata);
                }

                return schemaBuilder.GetSchema();
            }
 /// <summary>
 /// Returns the cached preview, executing the pipeline on first use. If execution throws, the
 /// exception is stored in <c>_pipelineExecutionException</c> and an empty data view with a single
 /// text column named "Blank" is cached instead.
 /// </summary>
 /// <returns>The cached preview; null when there is neither a preview nor a pipeline.</returns>
 private IDataView ExecutePipeline()
 {
     // Nothing to do when a preview is already cached or there is no pipeline to run.
     if (_preview != null || _pipeline == null)
     {
         return _preview;
     }

     try
     {
         _preview = _pipeline.Execute(_environment);
     }
     catch (Exception e)
     {
         // Keep the failure for later inspection and fall back to a placeholder view.
         _pipelineExecutionException = e;

         var placeholderSchema = new SchemaBuilder();
         placeholderSchema.AddColumn("Blank", TextType.Instance);
         _preview = new EmptyDataView(_environment, placeholderSchema.GetSchema());
     }

     return _preview;
 }
            /// <summary>
            /// Creates a row mapper that combines the output of the parent's generic mapper with a
            /// feature-contributions column. When <c>parent.Stringify</c> is set the contributions column
            /// is text; otherwise it is an R4 vector shaped like the feature column.
            /// </summary>
            /// <param name="env">Host environment.</param>
            /// <param name="parent">The bindable mapper that owns this row mapper.</param>
            /// <param name="schema">Role-mapped input schema; its feature column must be set.</param>
            public RowMapper(IHostEnvironment env, BindableMapper parent, RoleMappedSchema schema)
            {
                Contracts.AssertValue(env);
                _env = env;
                _env.AssertValue(schema);
                _env.AssertValue(parent);
                _env.AssertValue(schema.Feature);

                _parent = parent;
                InputRoleMappedSchema = schema;

                // Bind the wrapped mapper to the same schema; its output columns come first in OutputSchema.
                _genericRowMapper = parent.GenericMapper.Bind(_env, schema) as ISchemaBoundRowMapper;

                if (!parent.Stringify)
                {
                    // Numeric contributions: one R4 value per feature slot.
                    var contributionsType = new VectorType(NumberType.R4, schema.Feature.Type as VectorType);
                    var contributionsSchema = new FeatureContributionSchema(
                        _env, DefaultColumnNames.FeatureContributions, contributionsType,
                        InputSchema, InputRoleMappedSchema.Feature.Index);
                    _outputSchema = Schema.Create(contributionsSchema);
                }
                else
                {
                    // Text contributions: a single text column with no metadata.
                    var textSchemaBuilder = new SchemaBuilder();
                    textSchemaBuilder.AddColumn(DefaultColumnNames.FeatureContributions, TextType.Instance, null);
                    _outputSchema = textSchemaBuilder.GetSchema();

                    // Cache the feature slot names, or an empty buffer of the right length when none exist.
                    if (InputSchema.HasSlotNames(InputRoleMappedSchema.Feature.Index, InputRoleMappedSchema.Feature.Type.VectorSize))
                    {
                        InputSchema.GetMetadata(MetadataUtils.Kinds.SlotNames, InputRoleMappedSchema.Feature.Index,
                                                ref _slotNames);
                    }
                    else
                    {
                        _slotNames = VBufferUtils.CreateEmpty<ReadOnlyMemory<char>>(InputRoleMappedSchema.Feature.Type.VectorSize);
                    }
                }

                _outputGenericSchema = _genericRowMapper.OutputSchema;

                var schemaParts = new Schema[] { _outputGenericSchema, _outputSchema, };
                OutputSchema = new CompositeSchema(schemaParts).AsSchema;
            }
예제 #14
0
        /// <summary>
        /// Builds a schema with one column per operator of <paramref name="graph"/> whose first output has
        /// an element type that <c>Tf2MlNetTypeOrNull</c> can map. Each column is named after the operator
        /// and carries the operator type and, when the operator has inputs, the names of its upstream
        /// operators as metadata.
        /// </summary>
        /// <param name="ectx">Exception context; not used by this method body.</param>
        /// <param name="graph">The TensorFlow graph to inspect.</param>
        /// <param name="opType">When non-null, only operators of exactly this type are included.</param>
        /// <returns>The schema describing the selected operators' outputs.</returns>
        internal static DataViewSchema GetModelSchema(IExceptionContext ectx, TFGraph graph, string opType = null)
        {
            var schemaBuilder = new SchemaBuilder();

            foreach (var op in graph)
            {
                if (opType != null && opType != op.OpType)
                {
                    continue;
                }

                // Operators without outputs must be filtered before output 0 is touched at all:
                // the 'GetTensorShape' method crashes the TensorFlow runtime
                // (https://github.com/dotnet/machinelearning/issues/2156) when the operator has no outputs,
                // and querying the type of a non-existent output is avoided for the same reason.
                if (op.NumOutputs <= 0)
                {
                    continue;
                }

                // Determine element type in Tensorflow tensor. For example, a vector of floats may get NumberType.R4 here.
                var mlType = Tf2MlNetTypeOrNull(op[0].OutputType);

                // If the type is not supported in ML.NET then we cannot represent it as a column in a Schema.
                // We also cannot output it with a TensorFlowTransform, so we skip it.
                if (mlType == null)
                {
                    continue;
                }

                // Construct the final ML.NET type of a Tensorflow variable. Start from a vector without a
                // specified shape and refine it only when the tensor shape is usable: it is not a lone
                // non-positive dimension, and every dimension after the first is positive.
                var tensorShape = graph.GetTensorShape(op[0]).ToIntArray();
                int rank = Utils.Size(tensorShape);
                bool isSingleUnknownDimension = rank == 1 && tensorShape[0] <= 0;
                bool trailingDimensionsKnown = rank > 0 && tensorShape.Skip(1).All(x => x > 0);

                var columnType = new VectorType(mlType);
                if (!isSingleUnknownDimension && trailingDimensionsKnown)
                {
                    // A non-positive leading dimension is dropped; the rest of the shape is kept.
                    columnType = new VectorType(mlType, tensorShape[0] > 0 ? tensorShape : tensorShape.Skip(1).ToArray());
                }

                // There can be at most two metadata fields.
                //  1. The first field is always present. Its value is this operator's type. For example,
                //     if an output is produced by a "Softmax" operator, the value of this field should be "Softmax".
                //  2. The second field stores operators whose outputs are consumed by this operator. In other words,
                //     these values are names of some upstream operators which should be evaluated before executing
                //     the current operator. It's possible that one operator doesn't need any input, so this field
                //     can be missing.
                var metadataBuilder = new MetadataBuilder();

                // Create the first metadata field.
                metadataBuilder.Add(TensorflowOperatorTypeKind, TextDataViewType.Instance, (ref ReadOnlyMemory<char> value) => value = op.OpType.AsMemory());

                if (op.NumInputs > 0)
                {
                    // Put upstream operators' names into an array (type: VBuffer) of string (type: ReadOnlyMemory<char>).
                    VBuffer<ReadOnlyMemory<char>> upstreamOperatorNames = default;
                    var bufferEditor = VBufferEditor.Create(ref upstreamOperatorNames, op.NumInputs);
                    for (int i = 0; i < op.NumInputs; ++i)
                    {
                        bufferEditor.Values[i] = op.GetInput(i).Operation.Name.AsMemory();
                    }
                    upstreamOperatorNames = bufferEditor.Commit(); // Captured by the metadata getter below.

                    // Create the second metadata field.
                    metadataBuilder.Add(TensorflowUpstreamOperatorsKind, new VectorType(TextDataViewType.Instance, op.NumInputs),
                                        (ref VBuffer<ReadOnlyMemory<char>> value) => { upstreamOperatorNames.CopyTo(ref value); });
                }

                schemaBuilder.AddColumn(op.Name, columnType, metadataBuilder.GetMetadata());
            }

            return schemaBuilder.GetSchema();
        }