/// <summary>
/// Rebuilds <paramref name="partialSchema"/> column by column, attaching the label-name
/// metadata to the score column and leaving every other column's metadata untouched.
/// </summary>
/// <param name="partialSchema">Schema whose columns are copied, in order, into the result.</param>
/// <param name="scoreColumnIndex">Index of the column that receives the label-name metadata.</param>
/// <param name="labelNameType">Type of the label-name metadata value.</param>
/// <param name="labelNameGetter">Getter producing the label-name metadata value.</param>
/// <param name="labelNameKind">Metadata kind under which the label names are stored.</param>
/// <returns>A new schema with the same column names and types as <paramref name="partialSchema"/>.</returns>
private Schema DecorateOutputSchema(Schema partialSchema, int scoreColumnIndex, VectorType labelNameType,
    ValueGetter<VBuffer<T>> labelNameGetter, string labelNameKind)
{
    var outputSchemaBuilder = new SchemaBuilder();

    // Columns are appended in their original order so that indices stay aligned with the
    // schema of the mapper that computes the score column.
    for (int col = 0; col < partialSchema.ColumnCount; ++col)
    {
        var column = partialSchema[col];
        bool isScoreColumn = col == scoreColumnIndex;

        // A fresh metadata set is assembled for every column because existing metadata is read-only.
        var columnMetadata = new MetadataBuilder();

        // Non-score columns keep everything; the score column drops any existing entry of
        // kind labelNameKind so that it can be replaced below.
        columnMetadata.Add(column.Metadata, selector: kind => !isScoreColumn || kind != labelNameKind);
        if (isScoreColumn)
        {
            columnMetadata.Add(labelNameKind, labelNameType, labelNameGetter);
        }

        outputSchemaBuilder.AddColumn(column.Name, column.Type, columnMetadata.GetMetadata());
    }

    return outputSchemaBuilder.GetSchema();
}
/// <summary>
/// Derives the output schema of a <see cref="GroupTransform"/> from its input schema.
/// </summary>
/// <remarks>
/// The result lists the columns named in <c>_groupColumns</c> first (type and metadata copied
/// unchanged), followed by the columns named in <c>_keepColumns</c>, each re-typed as a vector
/// of its original primitive type.
/// </remarks>
/// <param name="sourceSchema">Input schema.</param>
/// <returns>The associated output schema produced by <see cref="GroupTransform"/>.</returns>
private Schema BuildOutputSchema(Schema sourceSchema)
{
    var outputBuilder = new SchemaBuilder();

    // Group(-key) columns: rows sharing the same key value are merged into one output row,
    // so these columns pass through with their original type and metadata.
    foreach (var keyName in _groupColumns)
    {
        var keyColumn = sourceSchema[keyName];
        outputBuilder.AddColumn(keyName, keyColumn.Type, keyColumn.Metadata);
    }

    // Aggregated (aka keep) columns: each becomes a vector of the source primitive type.
    foreach (var keepName in _keepColumns)
    {
        var keepColumn = sourceSchema[keepName];

        // Only the IsNormalized and KeyValues metadata kinds are carried over.
        var keptMetadata = new MetadataBuilder();
        keptMetadata.Add(keepColumn.Metadata,
            kind => kind == MetadataUtils.Kinds.IsNormalized || kind == MetadataUtils.Kinds.KeyValues);

        // The source column must be a primitive type; the 'as' cast yields null otherwise
        // and the check below rejects it.
        var aggregatedValueType = keepColumn.Type as PrimitiveType;
        _ectx.CheckValue(aggregatedValueType, nameof(aggregatedValueType), "Columns being aggregated must be primitive types such as string, float, or integer");

        outputBuilder.AddColumn(keepName, new VectorType(aggregatedValueType), keptMetadata.GetMetadata());
    }

    return outputBuilder.GetSchema();
}
/// <summary>
/// Builds a <see cref="Schema"/> from a <see cref="SchemaShape"/>, producing one column per
/// shape entry together with that entry's metadata.
/// </summary>
/// <remarks>
/// Column and metadata types are obtained from <c>MakeColumnType</c>. Each metadata entry is
/// backed by a getter obtained from <c>GetDefaultGetter</c> or <c>GetDefaultVectorGetter</c>,
/// chosen by whether the metadata type is a vector.
/// </remarks>
/// <param name="shape">The shape describing the columns and their metadata.</param>
/// <returns>A schema with the same number of columns as <paramref name="shape"/>.</returns>
public static Schema Create(SchemaShape shape)
{
    var builder = new SchemaBuilder();

    for (int i = 0; i < shape.Count; ++i)
    {
        var metaBuilder = new MetadataBuilder();
        var partialMetadata = shape[i].Metadata;

        for (int j = 0; j < partialMetadata.Count; ++j)
        {
            // The metadata entry is selected by j (the metadata loop index). Using the column
            // index i here would pick the wrong entry or run past the end of the collection.
            var metaColumn = partialMetadata[j];
            var metaColumnType = MakeColumnType(metaColumn);

            Delegate del;
            if (metaColumnType.IsVector)
            {
                del = Utils.MarshalInvoke(GetDefaultVectorGetter<int>, metaColumnType.ItemType.RawType);
            }
            else
            {
                del = Utils.MarshalInvoke(GetDefaultGetter<int>, metaColumnType.RawType);
            }

            metaBuilder.Add(metaColumn.Name, metaColumnType, del);
        }

        // Attach the metadata assembled above; without it the builder's work is discarded
        // and the column would be created with no metadata at all.
        builder.AddColumn(shape[i].Name, MakeColumnType(shape[i]), metaBuilder.GetMetadata());
    }

    return builder.GetSchema();
}
/// <summary>
/// Describes the output columns of this mapper: one detached column per entry of
/// <c>_infos</c>, carrying the input column's slot names and an <c>IsNormalized</c> flag
/// whose getter always yields <c>true</c>.
/// </summary>
/// <returns>An array sized by <c>_parent.ColumnPairs.Length</c>, filled for each entry of <c>_infos</c>.</returns>
protected override Schema.DetachedColumn[] GetOutputColumnsCore()
{
    var outputColumns = new Schema.DetachedColumn[_parent.ColumnPairs.Length];

    for (int iinfo = 0; iinfo < _infos.Length; iinfo++)
    {
        var info = _infos[iinfo];

        // The lookup result is validated through the assert on the returned index.
        InputSchema.TryGetColumnIndex(info.Input, out int sourceIndex);
        Host.Assert(sourceIndex >= 0);

        var metadata = new MetadataBuilder();

        // Only slot names are inherited from the input column.
        metadata.Add(InputSchema[sourceIndex].Metadata, kind => kind == MetadataUtils.Kinds.SlotNames);

        // The output is always flagged as normalized.
        ValueGetter<bool> isNormalizedGetter = (ref bool dst) => dst = true;
        metadata.Add(MetadataUtils.Kinds.IsNormalized, BoolType.Instance, isNormalizedGetter);

        outputColumns[iinfo] = new Schema.DetachedColumn(info.Output, info.OutputType, metadata.GetMetadata());
    }

    return outputColumns;
}
/// <summary>
/// Populates <paramref name="builder"/> with the metadata for output column
/// <paramref name="iinfo"/>: the input column's slot names plus a key-values entry.
/// </summary>
/// <param name="iinfo">Index into <c>_parent.ColumnPairs</c> identifying the column pair.</param>
/// <param name="builder">Metadata builder that receives the entries.</param>
private void AddMetadata(int iinfo, MetadataBuilder builder)
{
    // Inherit only the slot names from the corresponding input column.
    var inputColumnName = _parent.ColumnPairs[iinfo].input;
    builder.Add(InputSchema[inputColumnName].Metadata, kind => kind == MetadataUtils.Kinds.SlotNames);

    // Key values are produced on demand by forwarding to GetKeyValues for this column.
    ValueGetter<VBuffer<ReadOnlyMemory<char>>> keyValuesGetter =
        (ref VBuffer<ReadOnlyMemory<char>> dst) => GetKeyValues(iinfo, ref dst);
    builder.AddKeyValues(CharsCount, TextType.Instance, keyValuesGetter);
}
/// <summary>
/// Verifies that adding a value consisting of a tab, NUL or newline character under key "a"
/// makes <c>MetadataBuilder.Add</c> throw <see cref="ArgumentException"/>.
/// </summary>
public void InvalidChars()
{
    var builder = new MetadataBuilder();
    string invalidChars = "\t\0\n";

    for (int index = 0; index < invalidChars.Length; index++)
    {
        char candidate = invalidChars[index];

        // Each character is tried on its own, as a one-character string.
        Assert.Throws<ArgumentException>(() =>
        {
            builder.Add("a", candidate.ToString(CultureInfo.InvariantCulture));
        });
    }
}
/// <summary>
/// Describes the output columns of this mapper: one detached column per column pair, typed
/// by <c>_types</c> and inheriting only the <c>KeyValues</c> and <c>IsNormalized</c> metadata
/// of the mapped input column.
/// </summary>
/// <returns>One <see cref="Schema.DetachedColumn"/> per entry of <c>_parent.ColumnPairs</c>.</returns>
protected override Schema.DetachedColumn[] GetOutputColumnsCore()
{
    int pairCount = _parent.ColumnPairs.Length;
    var outputColumns = new Schema.DetachedColumn[pairCount];

    for (int col = 0; col < pairCount; col++)
    {
        var inherited = new MetadataBuilder();

        // ColMapNewToOld translates the output position into the input column index.
        inherited.Add(
            InputSchema[ColMapNewToOld[col]].Metadata,
            kind => kind == MetadataUtils.Kinds.KeyValues || kind == MetadataUtils.Kinds.IsNormalized);

        outputColumns[col] = new Schema.DetachedColumn(
            _parent.ColumnPairs[col].outputColumnName,
            _types[col],
            inherited.GetMetadata());
    }

    return outputColumns;
}
/// <summary>
/// Describes the output columns of this mapper: one detached column per column pair, typed
/// by <c>_types</c> and inheriting only the <c>SlotNames</c> metadata of the mapped input column.
/// </summary>
/// <returns>One <see cref="Schema.DetachedColumn"/> per entry of <c>_parent.ColumnPairs</c>.</returns>
protected override Schema.DetachedColumn[] GetOutputColumnsCore()
{
    int pairCount = _parent.ColumnPairs.Length;
    var outputColumns = new Schema.DetachedColumn[pairCount];

    for (int col = 0; col < pairCount; col++)
    {
        var slotNamesOnly = new MetadataBuilder();

        // ColMapNewToOld translates the output position into the input column index.
        slotNamesOnly.Add(
            InputSchema[ColMapNewToOld[col]].Metadata,
            kind => kind == MetadataUtils.Kinds.SlotNames);

        outputColumns[col] = new Schema.DetachedColumn(
            _parent.ColumnPairs[col].output,
            _types[col],
            slotNamesOnly.GetMetadata());
    }

    return outputColumns;
}
/// <summary>
/// Binds the pivot columns against <paramref name="inputSchema"/> and computes the output schema.
/// </summary>
/// <remarks>
/// Every input column appears in the output at the same index. Non-pivot columns are copied
/// as-is; pivot columns are re-typed to the item type of the input column and keep only the
/// metadata accepted by <c>ShouldPreserveMetadata</c>.
/// </remarks>
/// <param name="ectx">Exception context used for assertions; may be null.</param>
/// <param name="inputSchema">Schema of the input data.</param>
/// <param name="mode">Ungroup mode, stored in <c>Mode</c>.</param>
/// <param name="pivotColumns">Names of the columns to pivot; must be non-empty.</param>
public UngroupBinding(IExceptionContext ectx, Schema inputSchema, UngroupMode mode, string[] pivotColumns)
{
    Contracts.AssertValueOrNull(ectx);
    _ectx = ectx;
    _ectx.AssertValue(inputSchema);
    _ectx.AssertNonEmpty(pivotColumns);

    _inputSchema = inputSchema; // This also makes InputColumnCount valid.
    Mode = mode;

    Bind(_ectx, inputSchema, pivotColumns, out _infos);

    // Reverse lookup from input column index to its position in _infos; -1 marks a
    // column that is not a pivot column.
    _pivotIndex = Utils.CreateArray(InputColumnCount, -1);
    for (int slot = 0; slot < _infos.Length; slot++)
    {
        var inputIndex = _infos[slot].Index;
        _ectx.Assert(_pivotIndex[inputIndex] == -1);
        _pivotIndex[inputIndex] = slot;
    }

    var outputBuilder = new SchemaBuilder();
    for (int col = 0; col < InputColumnCount; ++col)
    {
        var inputColumn = inputSchema[col];
        bool isPivot = _pivotIndex[col] >= 0;

        if (isPivot)
        {
            // Pivot column: compute the proper type and metadata for it.
            var preserved = new MetadataBuilder();
            preserved.Add(inputColumn.Metadata, kind => ShouldPreserveMetadata(kind));

            // To explain the output type of pivot columns, consider the row
            //   Age  UserID
            //   18   {"Amy", "Willy"}
            // where "Age" and "UserID" are column names. If the only pivot column is "UserID",
            // the ungroup may produce
            //   Age  UserID
            //   18   "Amy"
            //   18   "Willy"
            // so the output "UserID" column has the element type of the input "UserID" column.
            outputBuilder.AddColumn(inputColumn.Name, inputColumn.Type.GetItemType(), preserved.GetMetadata());
        }
        else
        {
            // Not a pivot column: a plain copy at the same column index.
            outputBuilder.AddColumn(inputColumn.Name, inputColumn.Type, inputColumn.Metadata);
        }
    }

    OutputSchema = outputBuilder.GetSchema();
}
/// <summary>
/// Creates a binding exposing a single feature column whose slot names come from
/// <paramref name="collection"/>.
/// </summary>
/// <remarks>
/// The column is named after <c>RoleMappedSchema.ColumnRole.Feature.Value</c>, is typed as an
/// R4 vector with <c>collection.Count</c> slots, and carries a <c>SlotNames</c> metadata entry
/// whose getter forwards to <c>GetSlotNames(0, ...)</c>.
/// </remarks>
/// <param name="collection">The feature name collection; must not be null.</param>
public FeatureNameCollectionBinding(FeatureNameCollection collection)
{
    Contracts.CheckValue(collection, nameof(collection));
    _collection = collection;

    // Both the column and its slot-name metadata have one slot per feature name.
    var featureCount = collection.Count;
    _colType = new VectorType(NumberType.R4, featureCount);
    _slotNamesType = new VectorType(TextType.Instance, featureCount);

    var slotNamesMetadata = new MetadataBuilder();
    slotNamesMetadata.Add(
        MetadataUtils.Kinds.SlotNames,
        _slotNamesType,
        (ref VBuffer<ReadOnlyMemory<char>> slotNames) => { GetSlotNames(0, ref slotNames); });

    var featureSchema = new SchemaBuilder();
    featureSchema.AddColumn(RoleMappedSchema.ColumnRole.Feature.Value, _colType, slotNamesMetadata.GetMetadata());
    FeatureNameCollectionSchema = featureSchema.GetSchema();
}
/// <summary>
/// Converts the columns of an <see cref="InternalSchemaDefinition"/> into detached schema
/// columns, copying each column's name, type and metadata entries.
/// </summary>
/// <param name="schemaDefn">The schema definition to convert; asserted to be non-null.</param>
/// <returns>One <see cref="Schema.DetachedColumn"/> per column of <paramref name="schemaDefn"/>, in order.</returns>
internal static Schema.DetachedColumn[] GetSchemaColumns(InternalSchemaDefinition schemaDefn)
{
    Contracts.AssertValue(schemaDefn);

    var detached = new Schema.DetachedColumn[schemaDefn.Columns.Length];
    for (int index = 0; index < detached.Length; index++)
    {
        var definition = schemaDefn.Columns[index];

        // Re-register every metadata entry with its kind, type and getter delegate.
        var metadata = new MetadataBuilder();
        foreach (var entry in definition.Metadata)
        {
            var info = entry.Value;
            metadata.Add(info.Kind, info.MetadataType, info.GetGetterDelegate());
        }

        detached[index] = new Schema.DetachedColumn(definition.ColumnName, definition.ColumnType, metadata.GetMetadata());
    }

    return detached;
}
/// <summary>
/// Builds a <see cref="Schema"/> that mirrors the columns exposed by
/// <paramref name="inputBindings"/>, including each column's metadata.
/// </summary>
/// <param name="inputBindings">Source of column names, types and metadata; must not be null.</param>
/// <returns>A schema with <c>inputBindings.ColumnCount</c> columns, in the same order.</returns>
private static Schema CreateSchema(ColumnBindingsBase inputBindings)
{
    Contracts.CheckValue(inputBindings, nameof(inputBindings));

    var schemaBuilder = new SchemaBuilder();
    for (int col = 0; col < inputBindings.ColumnCount; col++)
    {
        var columnMetadata = new MetadataBuilder();
        foreach (var entry in inputBindings.GetMetadataTypes(col))
        {
            var kind = entry.Key;
            var metadataType = entry.Value;

            // The getter delegate is created for the metadata's raw type, which is only
            // known at run time, hence the MarshalInvoke dispatch.
            var getter = Utils.MarshalInvoke(GetMetadataGetterDelegate<int>, metadataType.RawType, inputBindings, col, kind);
            columnMetadata.Add(kind, metadataType, getter);
        }

        schemaBuilder.AddColumn(
            inputBindings.GetColumnName(col),
            inputBindings.GetColumnType(col),
            columnMetadata.GetMetadata());
    }

    return schemaBuilder.GetSchema();
}
// Round-trip test: builds a three-column schema ("A": R4 vector of size 94, "B": uint key
// with count 17, "C": I4 with one metadata entry "M" whose getter yields 484), converts it
// to a SchemaShape via SchemaShape.Create, and rebuilds a schema from that shape with
// FakeSchemaFactory.Create. It then asserts that column names and element/raw types survive.
// The asserted vector size (10) and key count (10) differ from the values used to build the
// source schema, and "M" is expected to read back as the default float rather than 484.
// NOTE(review): presumably FakeSchemaFactory substitutes placeholder sizes and default-valued
// getters - confirm against FakeSchemaFactory.
// NOTE(review): the closing brace of this method is not visible in this view.
void SimpleTest() { var metadataBuilder = new MetadataBuilder(); metadataBuilder.Add("M", NumberType.R4, (ref float v) => v = 484f); var schemaBuilder = new SchemaBuilder(); schemaBuilder.AddColumn("A", new VectorType(NumberType.R4, 94)); schemaBuilder.AddColumn("B", new KeyType(typeof(uint), 17)); schemaBuilder.AddColumn("C", NumberType.I4, metadataBuilder.GetMetadata()); var shape = SchemaShape.Create(schemaBuilder.GetSchema()); var fakeSchema = FakeSchemaFactory.Create(shape); var columnA = fakeSchema[0]; var columnB = fakeSchema[1]; var columnC = fakeSchema[2]; Assert.Equal("A", columnA.Name); Assert.Equal(NumberType.R4, columnA.Type.GetItemType()); Assert.Equal(10, columnA.Type.GetValueCount()); Assert.Equal("B", columnB.Name); Assert.Equal(DataKind.U4, columnB.Type.GetRawKind()); Assert.Equal(10u, columnB.Type.GetKeyCount()); Assert.Equal("C", columnC.Name); Assert.Equal(NumberType.I4, columnC.Type); var metaC = columnC.Metadata; Assert.Single(metaC.Schema); float mValue = -1; metaC.GetValue("M", ref mValue); Assert.Equal(default, mValue);
/// <summary>
/// Builds a schema describing the operators of a TensorFlow graph, one column per operator
/// whose first output has a type that <c>Tf2MlNetTypeOrNull</c> can map.
/// </summary>
/// <remarks>
/// Each column is named after the operator and typed as a vector of the mapped element type.
/// The vector is given known dimensions only when the tensor shape passes the check below.
/// Column metadata holds the operator type and, when the operator has inputs, the names of
/// the operators producing those inputs.
/// </remarks>
/// <param name="ectx">Exception context; not used by this method body.</param>
/// <param name="graph">The TensorFlow graph whose operators are enumerated.</param>
/// <param name="opType">When non-null, only operators whose <c>OpType</c> equals this value are included.</param>
/// <returns>The schema with one column per retained operator, in enumeration order.</returns>
internal static DataViewSchema GetModelSchema(IExceptionContext ectx, TFGraph graph, string opType = null)
{
    var schemaBuilder = new SchemaBuilder();
    foreach (var op in graph)
    {
        if (opType != null && opType != op.OpType)
        {
            continue;
        }

        // Operators with NumOutputs <= 0 must be filtered before output 0 is touched at all:
        // 'GetTensorShape' crashes the TensorFlow runtime
        // (https://github.com/dotnet/machinelearning/issues/2156) when the operator has no
        // outputs, and asking for the type of a non-existent output is equally unsafe.
        if (op.NumOutputs <= 0)
        {
            continue;
        }

        var tfType = op[0].OutputType;

        // Determine element type in Tensorflow tensor. For example, a vector of floats may get NumberType.R4 here.
        var mlType = Tf2MlNetTypeOrNull(tfType);

        // If the type is not supported in ML.NET then we cannot represent it as a column in a Schema.
        // We also cannot output it with a TensorFlowTransform, so we skip it.
        if (mlType == null)
        {
            continue;
        }

        // Construct the final ML.NET type of a Tensorflow variable. The default is a vector of
        // unknown size; dimensions are only applied when the shape is non-empty, is not a single
        // non-positive dimension, and every dimension after the first is positive. A non-positive
        // leading dimension is dropped from the resulting vector type.
        var tensorShape = graph.GetTensorShape(op[0]).ToIntArray();
        var columnType = new VectorType(mlType);
        if (!(Utils.Size(tensorShape) == 1 && tensorShape[0] <= 0) &&
            (Utils.Size(tensorShape) > 0 && tensorShape.Skip(1).All(x => x > 0)))
        {
            columnType = new VectorType(mlType, tensorShape[0] > 0 ? tensorShape : tensorShape.Skip(1).ToArray());
        }

        // There can be at most two metadata fields.
        //  1. The first field always presents. Its value is this operator's type. For example,
        //     if an output is produced by an "Softmax" operator, the value of this field should be "Softmax".
        //  2. The second field stores operators whose outputs are consumed by this operator. In other words,
        //     these values are names of some upstream operators which should be evaluated before executing
        //     the current operator. It's possible that one operator doesn't need any input, so this field
        //     can be missing.
        var metadataBuilder = new MetadataBuilder();

        // Create the first metadata field.
        metadataBuilder.Add(TensorflowOperatorTypeKind, TextDataViewType.Instance,
            (ref ReadOnlyMemory<char> value) => value = op.OpType.AsMemory());

        if (op.NumInputs > 0)
        {
            // Put upstream operators' names to an array (type: VBuffer) of string (type: ReadOnlyMemory<char>).
            VBuffer<ReadOnlyMemory<char>> upstreamOperatorNames = default;
            var bufferEditor = VBufferEditor.Create(ref upstreamOperatorNames, op.NumInputs);
            for (int i = 0; i < op.NumInputs; ++i)
            {
                bufferEditor.Values[i] = op.GetInput(i).Operation.Name.AsMemory();
            }
            upstreamOperatorNames = bufferEditor.Commit(); // Used in metadata's getter.

            // Create the second metadata field.
            metadataBuilder.Add(TensorflowUpstreamOperatorsKind, new VectorType(TextDataViewType.Instance, op.NumInputs),
                (ref VBuffer<ReadOnlyMemory<char>> value) => { upstreamOperatorNames.CopyTo(ref value); });
        }

        schemaBuilder.AddColumn(op.Name, columnType, metadataBuilder.GetMetadata());
    }

    return schemaBuilder.GetSchema();
}
/// <summary>
/// Hashes <paramref name="val"/> with a 10-bit <c>HashingTransformer</c> as a scalar, as a
/// dense vector and as a sparse vector, in both unordered and ordered mode, and checks the
/// results against the supplied expectations.
/// </summary>
/// <typeparam name="T">Raw type of the value being hashed.</typeparam>
/// <param name="val">The value to hash.</param>
/// <param name="type">The primitive column type corresponding to <typeparamref name="T"/>.</param>
/// <param name="expected">Expected hash in unordered mode.</param>
/// <param name="expectedOrdered">Expected hash in ordered mode for a scalar or for vector slot 0.</param>
/// <param name="expectedOrdered3">Expected hash in ordered mode for vector slot 3.</param>
private void HashTestCore<T>(T val, PrimitiveType type, uint expected, uint expectedOrdered, uint expectedOrdered3)
{
    const int bits = 10;
    const int vecLen = 5;

    // ---- Scalar input ----
    var scalarMeta = new MetadataBuilder();
    scalarMeta.AddPrimitiveValue("Foo", type, val);
    var scalarRow = MetadataUtils.MetadataAsRow(scalarMeta.GetMetadata());

    // First do an unordered hash.
    var scalarUnorderedInfo = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: bits);
    var scalarUnorderedXf = new HashingTransformer(Env, new[] { scalarUnorderedInfo });
    var scalarUnorderedMapper = scalarUnorderedXf.GetRowToRowMapper(scalarRow.Schema);
    scalarUnorderedMapper.OutputSchema.TryGetColumnIndex("Bar", out int outCol);
    var scalarUnorderedRow = scalarUnorderedMapper.GetRow(scalarRow, c => c == outCol);
    var scalarUnorderedGetter = scalarUnorderedRow.GetGetter<uint>(outCol);
    uint result = 0;
    scalarUnorderedGetter(ref result);
    Assert.Equal(expected, result);

    // Next do an ordered hash.
    var scalarOrderedInfo = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: true);
    var scalarOrderedXf = new HashingTransformer(Env, new[] { scalarOrderedInfo });
    var scalarOrderedMapper = scalarOrderedXf.GetRowToRowMapper(scalarRow.Schema);
    scalarOrderedMapper.OutputSchema.TryGetColumnIndex("Bar", out outCol);
    var scalarOrderedRow = scalarOrderedMapper.GetRow(scalarRow, c => c == outCol);
    var scalarOrderedGetter = scalarOrderedRow.GetGetter<uint>(outCol);
    scalarOrderedGetter(ref result);
    Assert.Equal(expectedOrdered, result);

    // ---- Dense vector input ----
    // Next build up a vector to make sure that hashing is consistent between scalar values
    // at least in the first position, and in the unordered case, the last position.
    var denseVec = new VBuffer<T>(vecLen, Utils.CreateArray(vecLen, val));
    var denseMeta = new MetadataBuilder();
    denseMeta.Add("Foo", new VectorType(type, vecLen), (ref VBuffer<T> dst) => denseVec.CopyTo(ref dst));
    var denseRow = MetadataUtils.MetadataAsRow(denseMeta.GetMetadata());

    var denseUnorderedInfo = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: false);
    var denseUnorderedXf = new HashingTransformer(Env, new[] { denseUnorderedInfo });
    var denseUnorderedMapper = denseUnorderedXf.GetRowToRowMapper(denseRow.Schema);
    denseUnorderedMapper.OutputSchema.TryGetColumnIndex("Bar", out outCol);
    var denseUnorderedRow = denseUnorderedMapper.GetRow(denseRow, c => c == outCol);
    var denseUnorderedGetter = denseUnorderedRow.GetGetter<VBuffer<uint>>(outCol);
    VBuffer<uint> vecResult = default;
    denseUnorderedGetter(ref vecResult);

    Assert.Equal(vecLen, vecResult.Length);
    // They all should equal this in this case.
    Assert.All(vecResult.DenseValues(), v => Assert.Equal(expected, v));

    // Now do ordered with the dense vector.
    var denseOrderedInfo = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: true);
    var denseOrderedXf = new HashingTransformer(Env, new[] { denseOrderedInfo });
    var denseOrderedMapper = denseOrderedXf.GetRowToRowMapper(denseRow.Schema);
    denseOrderedMapper.OutputSchema.TryGetColumnIndex("Bar", out outCol);
    var denseOrderedRow = denseOrderedMapper.GetRow(denseRow, c => c == outCol);
    var denseOrderedGetter = denseOrderedRow.GetGetter<VBuffer<uint>>(outCol);
    denseOrderedGetter(ref vecResult);

    Assert.Equal(vecLen, vecResult.Length);
    Assert.Equal(expectedOrdered, vecResult.GetItemOrDefault(0));
    Assert.Equal(expectedOrdered3, vecResult.GetItemOrDefault(3));
    Assert.All(vecResult.DenseValues(), v => Assert.True((v == 0) == (expectedOrdered == 0)));

    // ---- Sparse vector input ----
    // Let's now do a sparse vector: length 10 with explicit values at indices 0, 3 and 7.
    var sparseVec = new VBuffer<T>(10, 3, Utils.CreateArray(3, val), new[] { 0, 3, 7 });
    var sparseMeta = new MetadataBuilder();
    sparseMeta.Add("Foo", new VectorType(type, vecLen), (ref VBuffer<T> dst) => sparseVec.CopyTo(ref dst));
    var sparseRow = MetadataUtils.MetadataAsRow(sparseMeta.GetMetadata());

    var sparseUnorderedInfo = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: false);
    var sparseUnorderedXf = new HashingTransformer(Env, new[] { sparseUnorderedInfo });
    var sparseUnorderedMapper = sparseUnorderedXf.GetRowToRowMapper(sparseRow.Schema);
    sparseUnorderedMapper.OutputSchema.TryGetColumnIndex("Bar", out outCol);
    var sparseUnorderedRow = sparseUnorderedMapper.GetRow(sparseRow, c => c == outCol);
    var sparseUnorderedGetter = sparseUnorderedRow.GetGetter<VBuffer<uint>>(outCol);
    sparseUnorderedGetter(ref vecResult);

    Assert.Equal(10, vecResult.Length);
    Assert.Equal(expected, vecResult.GetItemOrDefault(0));
    Assert.Equal(expected, vecResult.GetItemOrDefault(3));
    Assert.Equal(expected, vecResult.GetItemOrDefault(7));

    var sparseOrderedInfo = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: true);
    var sparseOrderedXf = new HashingTransformer(Env, new[] { sparseOrderedInfo });
    var sparseOrderedMapper = sparseOrderedXf.GetRowToRowMapper(sparseRow.Schema);
    sparseOrderedMapper.OutputSchema.TryGetColumnIndex("Bar", out outCol);
    var sparseOrderedRow = sparseOrderedMapper.GetRow(sparseRow, c => c == outCol);
    var sparseOrderedGetter = sparseOrderedRow.GetGetter<VBuffer<uint>>(outCol);
    sparseOrderedGetter(ref vecResult);

    Assert.Equal(10, vecResult.Length);
    Assert.Equal(expectedOrdered, vecResult.GetItemOrDefault(0));
    Assert.Equal(expectedOrdered3, vecResult.GetItemOrDefault(3));
}