/// <summary>
/// Compute the output schema of a <see cref="GroupTransform"/> given an input schema.
/// </summary>
/// <param name="sourceSchema">Input schema.</param>
/// <returns>The associated output schema produced by <see cref="GroupTransform"/>.</returns>
private Schema BuildOutputSchema(Schema sourceSchema)
{
    // Output columns are emitted in two groups: first the group-key columns, then the aggregated columns.
    var schemaBuilder = new SchemaBuilder();

    // Group-key columns partition the input rows; rows sharing a key value collapse into a single output row.
    // They are copied through unchanged (type and metadata).
    foreach (var groupKeyColumnName in _groupColumns)
    {
        var keyColumn = sourceSchema[groupKeyColumnName];
        schemaBuilder.AddColumn(groupKeyColumnName, keyColumn.Type, keyColumn.Metadata);
    }

    // Aggregated (aka "keep") columns: each scalar input column becomes a variable-length vector
    // holding all values observed within its group.
    foreach (var groupValueColumnName in _keepColumns)
    {
        var sourceColumn = sourceSchema[groupValueColumnName];

        // Only normalization and key-value metadata remain meaningful after aggregation.
        var metadataBuilder = new MetadataBuilder();
        metadataBuilder.Add(sourceColumn.Metadata,
            s => s == MetadataUtils.Kinds.IsNormalized || s == MetadataUtils.Kinds.KeyValues);

        // Aggregation only supports primitive element types.
        var aggregatedValueType = sourceColumn.Type as PrimitiveType;
        _ectx.CheckValue(aggregatedValueType, nameof(aggregatedValueType),
            "Columns being aggregated must be primitive types such as string, float, or integer");

        schemaBuilder.AddColumn(groupValueColumnName, new VectorType(aggregatedValueType), metadataBuilder.GetMetadata());
    }

    return schemaBuilder.GetSchema();
}
/// <summary>
/// Builds the output schema for an ungroup operation: non-pivot input columns are copied through
/// at the same column index, while pivot (vector) columns are flattened to their item type.
/// </summary>
/// <param name="ectx">Optional exception context used for assertions; may be null.</param>
/// <param name="inputSchema">Schema of the data being ungrouped.</param>
/// <param name="mode">Controls how rows are produced when pivot columns have different lengths.</param>
/// <param name="pivotColumns">Names of the vector columns to flatten; must be non-empty.</param>
public UngroupBinding(IExceptionContext ectx, Schema inputSchema, UngroupMode mode, string[] pivotColumns)
{
    Contracts.AssertValueOrNull(ectx);
    _ectx = ectx;
    _ectx.AssertValue(inputSchema);
    _ectx.AssertNonEmpty(pivotColumns);

    _inputSchema = inputSchema; // This also makes InputColumnCount valid.
    Mode = mode;

    // Resolve pivot column names against the input schema into _infos (name -> index/type info).
    Bind(_ectx, inputSchema, pivotColumns, out _infos);

    // _pivotIndex maps an input column index to its position in _infos, or -1 for non-pivot columns.
    _pivotIndex = Utils.CreateArray(InputColumnCount, -1);
    for (int i = 0; i < _infos.Length; i++)
    {
        var info = _infos[i];
        // Each input column may be named as a pivot at most once.
        _ectx.Assert(_pivotIndex[info.Index] == -1);
        _pivotIndex[info.Index] = i;
    }

    var schemaBuilder = new SchemaBuilder();
    // Iterate through input columns. Input columns which are not pivot columns will be copied to output schema with the same column index unchanged.
    // Input columns which are pivot columns would also be copied but with different data types and different metadata.
    for (int i = 0; i < InputColumnCount; ++i)
    {
        if (_pivotIndex[i] < 0)
        {
            // i-th input column is not a pivot column. Let's do a naive copy.
            schemaBuilder.AddColumn(inputSchema[i].Name, inputSchema[i].Type, inputSchema[i].Metadata);
        }
        else
        {
            // i-th input column is a pivot column. Let's calculate proper type and metadata for it.
            var metadataBuilder = new MetadataBuilder();
            // ShouldPreserveMetadata decides which metadata kinds survive the flattening.
            metadataBuilder.Add(inputSchema[i].Metadata, metadataName => ShouldPreserveMetadata(metadataName));
            // To explain the output type of pivot columns, let's consider a row
            //   Age UserID
            //   18  {"Amy", "Willy"}
            // where "Age" and "UserID" are column names and 18/{"Amy", "Willy"} is "Age"/"UserID" column in this example row.
            // If the only pivot column is "UserID", the ungroup may produce
            //   Age UserID
            //   18  "Amy"
            //   18  "Willy"
            // One can see that "UserID" column (in output data) has a type identical to the element's type of the "UserID" column in input data.
            schemaBuilder.AddColumn(inputSchema[i].Name, inputSchema[i].Type.GetItemType(), metadataBuilder.GetMetadata());
        }
    }
    OutputSchema = schemaBuilder.GetSchema();
}
/// <summary>
/// Materializes a concrete <see cref="Schema"/> from a <see cref="SchemaShape"/>, giving every
/// column (and its metadata) a default-valued getter of the appropriate type.
/// </summary>
/// <param name="shape">The schema shape to realize.</param>
/// <returns>A schema whose columns mirror <paramref name="shape"/>.</returns>
public static Schema Create(SchemaShape shape)
{
    var builder = new SchemaBuilder();

    for (int i = 0; i < shape.Count; ++i)
    {
        var metaBuilder = new MetadataBuilder();
        var partialMetadata = shape[i].Metadata;
        for (int j = 0; j < partialMetadata.Count; ++j)
        {
            // BUG FIX: the inner loop must index metadata with j, not the outer column index i.
            var metaColumnType = MakeColumnType(partialMetadata[j]);

            // Build a getter producing default values; vector-valued metadata needs the vector getter.
            Delegate del;
            if (metaColumnType.IsVector)
                del = Utils.MarshalInvoke(GetDefaultVectorGetter<int>, metaColumnType.ItemType.RawType);
            else
                del = Utils.MarshalInvoke(GetDefaultGetter<int>, metaColumnType.RawType);
            metaBuilder.Add(partialMetadata[j].Name, metaColumnType, del);
        }
        // BUG FIX: attach the metadata that was just built; previously metaBuilder was discarded,
        // so the created schema silently dropped all column metadata.
        builder.AddColumn(shape[i].Name, MakeColumnType(shape[i]), metaBuilder.GetMetadata());
    }
    return builder.GetSchema();
}
/// <summary>
/// Constructs a dense in-memory data view over the columns accumulated in
/// <paramref name="builder"/>, attaching slot-name and key-value metadata where the
/// builder registered getters for them.
/// </summary>
/// <param name="env">Host environment; must be non-null.</param>
/// <param name="builder">Builder holding parallel name/column lists (counts must match).</param>
/// <param name="rowCount">Number of rows; must be non-negative.</param>
public DataView(IHostEnvironment env, ArrayDataViewBuilder builder, int rowCount)
{
    Contracts.AssertValue(env, "env");
    _host = env.Register("ArrayDataView");

    _host.AssertValue(builder);
    _host.Assert(rowCount >= 0);
    // Names and columns are parallel arrays; they must stay in sync.
    _host.Assert(builder._names.Count == builder._columns.Count);
    _columns = builder._columns.ToArray();

    var schemaBuilder = new SchemaBuilder();
    for (int i = 0; i < _columns.Length; i++)
    {
        var meta = new MetadataBuilder();

        // Only attach metadata kinds for which the builder registered a getter for this column name.
        if (builder._getSlotNames.TryGetValue(builder._names[i], out var slotNamesGetter))
        {
            meta.AddSlotNames(_columns[i].Type.VectorSize, slotNamesGetter);
        }

        if (builder._getKeyValues.TryGetValue(builder._names[i], out var keyValueGetter))
        {
            // Key values are exposed as text regardless of the key's underlying raw type.
            meta.AddKeyValues(_columns[i].Type.KeyCount, TextType.Instance, keyValueGetter);
        }
        schemaBuilder.AddColumn(builder._names[i], _columns[i].Type, meta.GetMetadata());
    }

    _schema = schemaBuilder.GetSchema();
    _rowCount = rowCount;
}
/// <summary>
/// Append label names to the score column as its metadata, returning a new schema.
/// Metadata is immutable, so the score column is rebuilt rather than modified in place.
/// </summary>
/// <param name="partialSchema">Schema produced by the score-computing mapper.</param>
/// <param name="scoreColumnIndex">Index of the score column to decorate.</param>
/// <param name="labelNameType">Type of the label-name metadata vector.</param>
/// <param name="labelNameGetter">Getter that fills in the label names.</param>
/// <param name="labelNameKind">Metadata kind name under which label names are stored.</param>
private Schema DecorateOutputSchema(Schema partialSchema, int scoreColumnIndex, VectorType labelNameType, ValueGetter<VBuffer<T>> labelNameGetter, string labelNameKind)
{
    var builder = new SchemaBuilder();
    // Columns are re-added in their original order so indices match the mapper's schema exactly.
    for (int col = 0; col < partialSchema.ColumnCount; ++col)
    {
        var column = partialSchema[col];
        var meta = new MetadataBuilder();

        if (col != scoreColumnIndex)
        {
            // Untouched column: carry every piece of existing metadata across.
            meta.Add(column.Metadata, selector: s => true);
        }
        else
        {
            // Score column: keep everything except a stale copy of the label-name kind,
            // then append the fresh label names.
            meta.Add(column.Metadata, selector: s => s != labelNameKind);
            meta.Add(labelNameKind, labelNameType, labelNameGetter);
        }

        builder.AddColumn(column.Name, column.Type, meta.GetMetadata());
    }
    return builder.GetSchema();
}
/// <summary>
/// Creates a single-column row whose lone column is named "Foo" with the given type,
/// served by the supplied getter delegate. The column carries no metadata.
/// </summary>
private Row(ColumnType type, Delegate getter)
{
    _getter = getter;

    var schemaBuilder = new SchemaBuilder();
    schemaBuilder.AddColumn("Foo", type, null);
    Schema = schemaBuilder.GetSchema();
}
// Verifies that FakeSchemaFactory.Create produces a schema mirroring the shape of a real
// schema: names and item types are preserved, while vector/key sizes collapse to the fake
// defaults (10) and metadata getters yield default values.
void SimpleTest()
{
    var metadataBuilder = new MetadataBuilder();
    metadataBuilder.Add("M", NumberType.R4, (ref float v) => v = 484f);

    // Build a source schema: a 94-slot float vector, a 17-value key, and a scalar with metadata.
    var schemaBuilder = new SchemaBuilder();
    schemaBuilder.AddColumn("A", new VectorType(NumberType.R4, 94));
    schemaBuilder.AddColumn("B", new KeyType(typeof(uint), 17));
    schemaBuilder.AddColumn("C", NumberType.I4, metadataBuilder.GetMetadata());

    var shape = SchemaShape.Create(schemaBuilder.GetSchema());

    var fakeSchema = FakeSchemaFactory.Create(shape);

    var columnA = fakeSchema[0];
    var columnB = fakeSchema[1];
    var columnC = fakeSchema[2];

    Assert.Equal("A", columnA.Name);
    Assert.Equal(NumberType.R4, columnA.Type.GetItemType());
    // Fake schemas normalize all vector sizes to 10, regardless of the source size (94 here).
    Assert.Equal(10, columnA.Type.GetValueCount());

    Assert.Equal("B", columnB.Name);
    Assert.Equal(DataKind.U4, columnB.Type.GetRawKind());
    // Key cardinality is likewise normalized to 10 (source had 17).
    Assert.Equal(10u, columnB.Type.GetKeyCount());

    Assert.Equal("C", columnC.Name);
    Assert.Equal(NumberType.I4, columnC.Type);

    var metaC = columnC.Metadata;
    Assert.Single(metaC.Schema);

    float mValue = -1;
    metaC.GetValue("M", ref mValue);
    // Fake metadata getters return default values, not the source's 484f.
    Assert.Equal(default, mValue);
// Smoke test: adding a column by name should not throw.
public void Can_AddColumn()
{
    // Arrange
    const string columnName = "MyUserId";

    // Act
    _schemaBuilder.AddColumn(columnName);

    // Assert — disabled until the builder exposes its column collection for inspection.
    //Assert.IsTrue(_schemaBuilder.Columns.Count == 1);
    //Assert.AreEqual(columnName, _schemaBuilder.Columns[0].Name);
}
/// <summary>
/// Binds a feature-name collection to a single-column schema: one R4 vector column under the
/// standard Feature role name, carrying slot-name metadata served by <c>GetSlotNames</c>.
/// </summary>
/// <param name="collection">Feature names backing the slot-name metadata; must be non-null.</param>
public FeatureNameCollectionBinding(FeatureNameCollection collection)
{
    Contracts.CheckValue(collection, nameof(collection));

    _collection = collection;
    // Both the column and its slot names have one entry per feature.
    _colType = new VectorType(NumberType.R4, collection.Count);
    _slotNamesType = new VectorType(TextType.Instance, collection.Count);

    var metaBuilder = new MetadataBuilder();
    metaBuilder.Add(MetadataUtils.Kinds.SlotNames, _slotNamesType,
        (ref VBuffer<ReadOnlyMemory<char>> slotNames) => GetSlotNames(0, ref slotNames));

    var builder = new SchemaBuilder();
    builder.AddColumn(RoleMappedSchema.ColumnRole.Feature.Value, _colType, metaBuilder.GetMetadata());
    FeatureNameCollectionSchema = builder.GetSchema();
}
/// <summary>
/// Converts legacy <see cref="ColumnBindingsBase"/> bindings into a <see cref="Schema"/>,
/// wiring each column's metadata through dynamically-constructed getter delegates.
/// </summary>
/// <param name="inputBindings">The bindings to convert; must be non-null.</param>
private static Schema CreateSchema(ColumnBindingsBase inputBindings)
{
    Contracts.CheckValue(inputBindings, nameof(inputBindings));

    var builder = new SchemaBuilder();
    for (int col = 0; col < inputBindings.ColumnCount; col++)
    {
        var meta = new MetadataBuilder();
        foreach (var pair in inputBindings.GetMetadataTypes(col))
        {
            // MarshalInvoke instantiates GetMetadataGetterDelegate<T> with the metadata's raw type.
            var getter = Utils.MarshalInvoke(GetMetadataGetterDelegate<int>, pair.Value.RawType, inputBindings, col, pair.Key);
            meta.Add(pair.Key, pair.Value, getter);
        }
        builder.AddColumn(inputBindings.GetColumnName(col), inputBindings.GetColumnType(col), meta.GetMetadata());
    }

    return builder.GetSchema();
}
/// <summary>
/// After <see cref="_sourceSchema"/> and <see cref="_sources"/> are set, pick up selected columns from <see cref="_sourceSchema"/> to create <see cref="OutputSchema"/>.
/// Note that <see cref="_sources"/> tells us what columns in <see cref="_sourceSchema"/> are put into <see cref="OutputSchema"/>.
/// </summary>
/// <returns>Schema containing only the selected columns, in selection order.</returns>
private Schema ComputeOutputSchema()
{
    var schemaBuilder = new SchemaBuilder();
    for (int i = 0; i < _sources.Length; ++i)
    {
        // selectedIndex is a column index of input schema. Note that the input column indexed by _sources[i] in _sourceSchema is sent
        // to the i-th column in the output schema.
        var selectedIndex = _sources[i];

        // The dropped/kept columns are determined by user-specified arguments, so we throw if a bad configuration is provided.
        // FIX: also reject negative indices, and only pay for the message formatting on the failure path.
        if (selectedIndex < 0 || selectedIndex >= _sourceSchema.Count)
        {
            string fmt = string.Format("Column index {0} invalid for input with {1} columns", selectedIndex, _sourceSchema.Count);
            Contracts.Check(false, fmt);
        }

        // Copy the selected column into output schema.
        var selectedColumn = _sourceSchema[selectedIndex];
        schemaBuilder.AddColumn(selectedColumn.Name, selectedColumn.Type, selectedColumn.Metadata);
    }
    return schemaBuilder.GetSchema();
}
/// <summary>
/// Lazily executes the pipeline, caching the resulting preview. If execution throws, the
/// exception is recorded in <c>_pipelineExecutionException</c> and a single-column empty
/// data view ("Blank", text) is substituted so callers always get a usable view.
/// </summary>
private IDataView ExecutePipeline()
{
    // Already computed, or nothing to run: return whatever we have (possibly null).
    if (_preview != null || _pipeline == null)
        return _preview;

    try
    {
        _preview = _pipeline.Execute(_environment);
    }
    catch (Exception e)
    {
        // Best-effort: remember the failure and fall back to an empty placeholder view.
        _pipelineExecutionException = e;
        var builder = new SchemaBuilder();
        builder.AddColumn("Blank", TextType.Instance);
        _preview = new EmptyDataView(_environment, builder.GetSchema());
    }
    return _preview;
}
/// <summary>
/// Binds the parent's generic mapper to the given role-mapped schema and builds the output
/// schema for feature contributions: a single text column when stringified, otherwise a
/// numeric vector column sized like the feature column.
/// </summary>
/// <param name="env">Host environment; must be non-null.</param>
/// <param name="parent">Owning bindable mapper supplying the generic mapper and options.</param>
/// <param name="schema">Role-mapped schema; must have a Feature column.</param>
public RowMapper(IHostEnvironment env, BindableMapper parent, RoleMappedSchema schema)
{
    Contracts.AssertValue(env);
    _env = env;
    _env.AssertValue(schema);
    _env.AssertValue(parent);
    _env.AssertValue(schema.Feature);
    _parent = parent;
    InputRoleMappedSchema = schema;
    var genericMapper = parent.GenericMapper.Bind(_env, schema);
    // NOTE(review): the 'as' cast may yield null if the bound mapper is not a row mapper — presumably guaranteed by the parent; confirm.
    _genericRowMapper = genericMapper as ISchemaBoundRowMapper;

    if (parent.Stringify)
    {
        // Stringified mode: contributions are rendered into one text column.
        var builder = new SchemaBuilder();
        builder.AddColumn(DefaultColumnNames.FeatureContributions, TextType.Instance, null);
        _outputSchema = builder.GetSchema();

        // Cache the feature slot names (if present) for rendering; otherwise use an empty buffer.
        if (InputSchema.HasSlotNames(InputRoleMappedSchema.Feature.Index, InputRoleMappedSchema.Feature.Type.VectorSize))
        {
            InputSchema.GetMetadata(MetadataUtils.Kinds.SlotNames, InputRoleMappedSchema.Feature.Index, ref _slotNames);
        }
        else
        {
            _slotNames = VBufferUtils.CreateEmpty<ReadOnlyMemory<char>>(InputRoleMappedSchema.Feature.Type.VectorSize);
        }
    }
    else
    {
        // Numeric mode: one R4 vector column shaped like the feature column.
        _outputSchema = Schema.Create(new FeatureContributionSchema(_env, DefaultColumnNames.FeatureContributions,
            new VectorType(NumberType.R4, schema.Feature.Type as VectorType),
            InputSchema, InputRoleMappedSchema.Feature.Index));
    }

    // Final output is the generic mapper's columns followed by the contribution column(s).
    _outputGenericSchema = _genericRowMapper.OutputSchema;
    OutputSchema = new CompositeSchema(new Schema[] { _outputGenericSchema, _outputSchema, }).AsSchema;
}
/// <summary>
/// Walks a TensorFlow graph and builds a <see cref="DataViewSchema"/> with one column per
/// operator whose output type maps to an ML.NET type, optionally filtered to a single op type.
/// Each column carries the operator's type and (when present) its upstream operators as metadata.
/// </summary>
/// <param name="ectx">Exception context (currently unused in the visible body).</param>
/// <param name="graph">TensorFlow graph to inspect.</param>
/// <param name="opType">If non-null, only operators of this type are included.</param>
internal static DataViewSchema GetModelSchema(IExceptionContext ectx, TFGraph graph, string opType = null)
{
    var schemaBuilder = new SchemaBuilder();
    foreach (var op in graph)
    {
        if (opType != null && opType != op.OpType)
        {
            continue;
        }

        var tfType = op[0].OutputType;
        // Determine element type in Tensorflow tensor. For example, a vector of floats may get NumberType.R4 here.
        var mlType = Tf2MlNetTypeOrNull(tfType);

        // If the type is not supported in ML.NET then we cannot represent it as a column in an Schema.
        // We also cannot output it with a TensorFlowTransform, so we skip it.
        // Furthermore, operators which have NumOutputs <= 0 needs to be filtered.
        // The 'GetTensorShape' method crashes TensorFlow runtime
        // (https://github.com/dotnet/machinelearning/issues/2156) when the operator has no outputs.
        if (mlType == null || op.NumOutputs <= 0)
        {
            continue;
        }

        // Construct the final ML.NET type of a Tensorflow variable.
        var tensorShape = graph.GetTensorShape(op[0]).ToIntArray();
        // Default to an unknown-size vector; refine to a sized vector when the shape permits.
        var columnType = new VectorType(mlType);
        // Use the concrete shape when all dimensions past the first are positive; a non-positive
        // leading dimension (e.g. batch) is dropped rather than baked into the type.
        if (!(Utils.Size(tensorShape) == 1 && tensorShape[0] <= 0) &&
            (Utils.Size(tensorShape) > 0 && tensorShape.Skip(1).All(x => x > 0)))
        {
            columnType = new VectorType(mlType, tensorShape[0] > 0 ? tensorShape : tensorShape.Skip(1).ToArray());
        }

        // There can be at most two metadata fields.
        //  1. The first field always presents. Its value is this operator's type. For example,
        //     if an output is produced by an "Softmax" operator, the value of this field should be "Softmax".
        //  2. The second field stores operators whose outputs are consumed by this operator. In other words,
        //     these values are names of some upstream operators which should be evaluated before executing
        //     the current operator. It's possible that one operator doesn't need any input, so this field
        //     can be missing.
        var metadataBuilder = new MetadataBuilder();
        // Create the first metadata field.
        metadataBuilder.Add(TensorflowOperatorTypeKind, TextDataViewType.Instance,
            (ref ReadOnlyMemory<char> value) => value = op.OpType.AsMemory());

        if (op.NumInputs > 0)
        {
            // Put upstream operators' names to an array (type: VBuffer) of string (type: ReadOnlyMemory<char>).
            VBuffer<ReadOnlyMemory<char>> upstreamOperatorNames = default;
            var bufferEditor = VBufferEditor.Create(ref upstreamOperatorNames, op.NumInputs);
            for (int i = 0; i < op.NumInputs; ++i)
            {
                bufferEditor.Values[i] = op.GetInput(i).Operation.Name.AsMemory();
            }
            upstreamOperatorNames = bufferEditor.Commit(); // Used in metadata's getter.

            // Create the second metadata field.
            metadataBuilder.Add(TensorflowUpstreamOperatorsKind, new VectorType(TextDataViewType.Instance, op.NumInputs),
                (ref VBuffer<ReadOnlyMemory<char>> value) => { upstreamOperatorNames.CopyTo(ref value); });
        }

        schemaBuilder.AddColumn(op.Name, columnType, metadataBuilder.GetMetadata());
    }
    return(schemaBuilder.GetSchema());
}