Ejemplo n.º 1
0
        /// <summary>
        /// Builds a <see cref="DataViewSchema"/> with one column per column of <paramref name="inputBindings"/>,
        /// copying each column's name, type and annotations.
        /// </summary>
        /// <param name="inputBindings">The bindings whose columns are mirrored; must not be null.</param>
        /// <returns>The schema assembled from all bound columns, in binding order.</returns>
        private static DataViewSchema CreateSchema(ColumnBindingsBase inputBindings)
        {
            Contracts.CheckValue(inputBindings, nameof(inputBindings));

            var schemaBuilder = new DataViewSchema.Builder();
            for (int col = 0; col < inputBindings.ColumnCount; col++)
            {
                var annotations = new DataViewSchema.Annotations.Builder();
                foreach (var annotation in inputBindings.GetAnnotationTypes(col))
                {
                    var kind = annotation.Key;
                    var type = annotation.Value;

                    // The getter is created through MarshalInvoke, keyed on the annotation's raw type.
                    var getter = Utils.MarshalInvoke(GetAnnotationGetterDelegate<int>, type.RawType, inputBindings, col, kind);
                    annotations.Add(kind, type, getter);
                }

                var columnName = inputBindings.GetColumnName(col);
                var columnType = inputBindings.GetColumnType(col);
                schemaBuilder.AddColumn(columnName, columnType, annotations.ToAnnotations());
            }

            return schemaBuilder.ToSchema();
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Checks that validating experiment arguments throws an <see cref="ArgumentException"/> when a column
        /// registered as a text column has type Single.
        /// </summary>
        public void ValidateTextColumnNotText()
        {
            const string TextPurposeColName = "TextColumn";

            // All three columns, including the one registered as text below, are declared as Single.
            var builder = new DataViewSchema.Builder();
            foreach (var name in new[] { DefaultColumnNames.Features, DefaultColumnNames.Label, TextPurposeColName })
            {
                builder.AddColumn(name, NumberDataViewType.Single);
            }
            var data = DataViewTestFixture.BuildDummyDataView(builder.ToSchema());

            var columns = new ColumnInformation();
            columns.TextColumnNames.Add(TextPurposeColName);

            Action validate = () => UserInputValidationUtil.ValidateExperimentExecuteArgs(data, columns, null, TaskKind.Regression);
            var error = Assert.Throws<ArgumentException>(validate);

            Assert.Equal("Provided text column 'TextColumn' was of type Single, but only type String is allowed.", error.Message);
        }
        /// <summary>
        /// Creates a <see cref="DataViewSchema"/> made of exactly one score column.
        /// </summary>
        /// <param name="scoreType">Type assigned to the score column; must not be null.</param>
        /// <param name="scoreColumnKindValue">Value stored under <see cref="AnnotationUtils.Kinds.ScoreColumnKind"/> in the score column's annotations; must not be empty.</param>
        /// <param name="scoreColumnName">Name given to the score column in the produced <see cref="DataViewSchema"/>.</param>
        /// <returns>A <see cref="DataViewSchema"/> with a single, annotated column.</returns>
        public static DataViewSchema Create(DataViewType scoreType, string scoreColumnKindValue, string scoreColumnName = AnnotationUtils.Const.ScoreValueKind.Score)
        {
            Contracts.CheckValue(scoreType, nameof(scoreType));
            Contracts.CheckNonEmpty(scoreColumnKindValue, nameof(scoreColumnKindValue));

            // The column carries two text annotations: the caller-supplied score column kind,
            // and the score value kind, which is always the constant "Score" kind.
            ValueGetter<ReadOnlyMemory<char>> columnKindGetter =
                (ref ReadOnlyMemory<char> value) => value = scoreColumnKindValue.AsMemory();
            ValueGetter<ReadOnlyMemory<char>> valueKindGetter =
                (ref ReadOnlyMemory<char> value) => value = AnnotationUtils.Const.ScoreValueKind.Score.AsMemory();

            var annotations = new DataViewSchema.Annotations.Builder();
            annotations.Add(AnnotationUtils.Kinds.ScoreColumnKind, TextDataViewType.Instance, columnKindGetter);
            annotations.Add(AnnotationUtils.Kinds.ScoreValueKind, TextDataViewType.Instance, valueKindGetter);

            // The resulting schema has the score column and nothing else.
            var result = new DataViewSchema.Builder();
            result.AddColumn(scoreColumnName, scoreType, annotations.ToAnnotations());
            return result.ToSchema();
        }
Ejemplo n.º 4
0
            /// <summary>
            /// Once <see cref="_sourceSchema"/> and <see cref="_sources"/> are set, builds <see cref="OutputSchema"/> by copying the selected
            /// columns out of <see cref="_sourceSchema"/>. The i-th entry of <see cref="_sources"/> is the input column index that becomes
            /// the i-th output column.
            /// </summary>
            private DataViewSchema ComputeOutputSchema()
            {
                var outputBuilder = new DataViewSchema.Builder();

                foreach (var inputIndex in _sources)
                {
                    // Which columns are kept comes from user-specified arguments, so a bad configuration is reported by throwing.
                    Contracts.Check(
                        inputIndex < _sourceSchema.Count,
                        string.Format("Column index {0} invalid for input with {1} columns", inputIndex, _sourceSchema.Count));

                    // Carry the selected input column over to the next output slot unchanged.
                    var column = _sourceSchema[inputIndex];
                    outputBuilder.AddColumn(column.Name, column.Type, column.Annotations);
                }

                return outputBuilder.ToSchema();
            }
Ejemplo n.º 5
0
        /// <summary>
        /// Binds several source schemas side by side: records the running column totals and builds
        /// <see cref="OutputSchema"/> as the concatenation of all source columns, in source order.
        /// </summary>
        /// <param name="sources">The schemas to concatenate; asserted to be non-empty.</param>
        public ZipBinding(DataViewSchema[] sources)
        {
            Contracts.AssertNonEmpty(sources);
            _sources = sources;

            // Entry k holds the number of columns contributed by the first k sources; entry 0 stays at the array default of 0.
            _cumulativeColCounts = new int[_sources.Length + 1];

            var combined = new DataViewSchema.Builder();
            int index = 0;
            foreach (var source in sources)
            {
                _cumulativeColCounts[index + 1] = _cumulativeColCounts[index] + source.Count;
                combined.AddColumns(source);
                index++;
            }

            OutputSchema = combined.ToSchema();
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Checks that validating experiment arguments throws an <see cref="ArgumentException"/> whose message
        /// starts with "Validation data has 0 rows" when the validation data view is built without a dummy row.
        /// </summary>
        public void ValidateEmptyValidationDataThrows()
        {
            // Training data: a single row with a numeric feature and a label.
            var trainBuilder = new ArrayDataViewBuilder(new MLContext());
            trainBuilder.AddColumn("Number", NumberDataViewType.Single, 0f);
            trainBuilder.AddColumn(DefaultColumnNames.Label, NumberDataViewType.Single, 0f);
            var train = trainBuilder.GetDataView();

            // Validation data: same two columns, but built without the dummy row.
            var validationSchema = new DataViewSchema.Builder();
            foreach (var name in new[] { "Number", DefaultColumnNames.Label })
            {
                validationSchema.AddColumn(name, NumberDataViewType.Single);
            }
            var validation = DataViewTestFixture.BuildDummyDataView(validationSchema.ToSchema(), createDummyRow: false);

            Action validate = () => UserInputValidationUtil.ValidateExperimentExecuteArgs(
                train, new ColumnInformation(), validation, TaskKind.Regression);
            var error = Assert.Throws<ArgumentException>(validate);

            Assert.StartsWith("Validation data has 0 rows", error.Message);
        }
        // Builds a three-column schema, converts it to a SchemaShape, asks FakeSchemaFactory for a schema based on
        // that shape, and asserts on the names, types, sizes and annotations of the resulting columns.
        void SimpleTest()
        {
            // Annotation "M" on the real schema: a float getter that yields 484.
            var metadataBuilder = new DataViewSchema.Annotations.Builder();

            metadataBuilder.Add("M", NumberDataViewType.Single, (ref float v) => v = 484f);
            var schemaBuilder = new DataViewSchema.Builder();

            // A: float vector of size 94, B: uint key with count 17, C: Int32 carrying the "M" annotation.
            schemaBuilder.AddColumn("A", new VectorDataViewType(NumberDataViewType.Single, 94));
            schemaBuilder.AddColumn("B", new KeyDataViewType(typeof(uint), 17));
            schemaBuilder.AddColumn("C", NumberDataViewType.Int32, metadataBuilder.ToAnnotations());

            var shape = SchemaShape.Create(schemaBuilder.ToSchema());

            var fakeSchema = FakeSchemaFactory.Create(shape);

            var columnA = fakeSchema[0];
            var columnB = fakeSchema[1];
            var columnC = fakeSchema[2];

            // The fake column keeps the name and item type, but the expected value count is 10 rather than the original 94.
            Assert.Equal("A", columnA.Name);
            Assert.Equal(NumberDataViewType.Single, columnA.Type.GetItemType());
            Assert.Equal(10, columnA.Type.GetValueCount());

            // Likewise the expected key count is 10 rather than the original 17; the raw kind is still U4.
            Assert.Equal("B", columnB.Name);
            Assert.Equal(InternalDataKind.U4, columnB.Type.GetRawKind());
            Assert.Equal(10u, columnB.Type.GetKeyCount());

            Assert.Equal("C", columnC.Name);
            Assert.Equal(NumberDataViewType.Int32, columnC.Type);

            var metaC = columnC.Annotations;

            // Exactly one annotation is expected on the fake column.
            Assert.Single(metaC.Schema);

            // Start from a sentinel (-1) so the assertion below proves the getter actually wrote a value.
            float mValue = -1;

            // The fake annotation getter is expected to produce the default float, not the original 484.
            metaC.GetValue("M", ref mValue);
            Assert.Equal(default, mValue);
            /// <summary>
            /// Creates a mapper bound to <paramref name="schema"/> and builds <see cref="OutputSchema"/>. The output can hold up to
            /// three vector columns (tree values, leaf indicators, path indicators), added in that order; a column is only
            /// added when its name argument is non-null.
            /// </summary>
            /// <param name="ectx">Exception context used for the assertions.</param>
            /// <param name="owner">The bindable mapper providing the ensemble, the leaf count and the slot-name getters.</param>
            /// <param name="schema">Role-mapped input schema; asserted to have a feature column.</param>
            /// <param name="treesColumnName">Name of the tree-values output column, or null to omit it.</param>
            /// <param name="leavesColumnName">Name of the leaf-indicator output column, or null to omit it.</param>
            /// <param name="pathsColumnName">Name of the path-indicator output column, or null to omit it.</param>
            public BoundMapper(IExceptionContext ectx, TreeEnsembleFeaturizerBindableMapper owner, RoleMappedSchema schema,
                               string treesColumnName, string leavesColumnName, string pathsColumnName)
            {
                Contracts.AssertValue(ectx);
                ectx.AssertValue(owner);
                ectx.AssertValue(schema);
                ectx.Assert(schema.Feature.HasValue);

                _ectx = ectx;
                _owner = owner;
                InputRoleMappedSchema = schema;

                var treeCount = owner._ensemble.TrainedEnsemble.NumTrees;
                var leafCount = owner._totalLeafCount;

                // One slot per tree: the output of each tree on a given example.
                var treeValueType = new VectorDataViewType(NumberDataViewType.Single, treeCount);
                // One slot per leaf in the ensemble: an indicator of the leaf the example lands in, for every tree.
                var leafIdType = new VectorDataViewType(NumberDataViewType.Single, leafCount);
                // One slot per internal node in the ensemble: an indicator of the nodes on the example's paths.
                // A binary tree satisfies #internal + #leaf = 2 * #internal + 1 (every node but the root is a child of an
                // internal node), hence #internal = #leaf - 1 per tree and #leaf - #trees over the whole ensemble.
                var pathIdType = new VectorDataViewType(NumberDataViewType.Single, leafCount - treeCount);

                // Assemble the output schema from the types derived above.
                var outputBuilder = new DataViewSchema.Builder();

                _treesColumnName = treesColumnName;
                if (treesColumnName != null)
                {
                    // Tree values only carry slot names.
                    ValueGetter<VBuffer<ReadOnlyMemory<char>>> slotNames = owner.GetTreeSlotNames;
                    var annotations = new DataViewSchema.Annotations.Builder();
                    annotations.Add(AnnotationUtils.Kinds.SlotNames, AnnotationUtils.GetNamesType(treeValueType.Size), slotNames);

                    outputBuilder.AddColumn(treesColumnName, treeValueType, annotations.ToAnnotations());
                }

                _leavesColumnName = leavesColumnName;
                if (leavesColumnName != null)
                {
                    // Leaf indicators carry slot names and are flagged as normalized.
                    ValueGetter<VBuffer<ReadOnlyMemory<char>>> slotNames = owner.GetLeafSlotNames;
                    var annotations = new DataViewSchema.Annotations.Builder();
                    annotations.Add(AnnotationUtils.Kinds.SlotNames, AnnotationUtils.GetNamesType(leafIdType.Size), slotNames);
                    annotations.Add(AnnotationUtils.Kinds.IsNormalized, BooleanDataViewType.Instance, (ref bool value) => value = true);

                    outputBuilder.AddColumn(leavesColumnName, leafIdType, annotations.ToAnnotations());
                }

                _pathsColumnName = pathsColumnName;
                if (pathsColumnName != null)
                {
                    // Path indicators carry slot names and are flagged as normalized.
                    ValueGetter<VBuffer<ReadOnlyMemory<char>>> slotNames = owner.GetPathSlotNames;
                    var annotations = new DataViewSchema.Annotations.Builder();
                    annotations.Add(AnnotationUtils.Kinds.SlotNames, AnnotationUtils.GetNamesType(pathIdType.Size), slotNames);
                    annotations.Add(AnnotationUtils.Kinds.IsNormalized, BooleanDataViewType.Instance, (ref bool value) => value = true);

                    outputBuilder.AddColumn(pathsColumnName, pathIdType, annotations.ToAnnotations());
                }

                OutputSchema = outputBuilder.ToSchema();
            }
Ejemplo n.º 9
0
        /// <summary>
        /// Builds a <see cref="DataViewSchema"/> with one column per operator of <paramref name="graph"/> whose first output
        /// has an element type representable in ML.NET. Each column is named after the operator and annotated with the
        /// operator type and, when the operator has inputs, the names of its upstream operators.
        /// </summary>
        /// <param name="ectx">Exception context; not used by this method's body.</param>
        /// <param name="graph">The graph whose operators are enumerated.</param>
        /// <param name="opType">When non-null, only operators with exactly this type are included.</param>
        /// <returns>The schema describing the selected operators' outputs.</returns>
        internal static DataViewSchema GetModelSchema(IExceptionContext ectx, Graph graph, string opType = null)
        {
            var schemaBuilder = new DataViewSchema.Builder();

            foreach (Operation op in graph)
            {
                if (opType != null && opType != op.OpType)
                {
                    continue;
                }

                // Operators which have NumOutputs <= 0 need to be filtered out, and this has to happen before output 0
                // is queried below: asking an operator without outputs for its first output is an out-of-range request.
                // The 'GetTensorShape' method crashes TensorFlow runtime
                // (https://github.com/dotnet/machinelearning/issues/2156) when the operator has no outputs.
                if (op.NumOutputs <= 0)
                {
                    continue;
                }

                var tfType = op.OutputType(0);
                // Determine element type in Tensorflow tensor. For example, a vector of floats may get NumberType.R4 here.
                var mlType = Tf2MlNetTypeOrNull(tfType);

                // If the type is not supported in ML.NET then we cannot represent it as a column in a Schema.
                // We also cannot output it with a TensorFlowTransform, so we skip it.
                if (mlType == null)
                {
                    continue;
                }

                // Construct the final ML.NET type of a Tensorflow variable.
                // The default is a vector of unknown size; a sized vector is used only when every dimension after the
                // first is positive. A non-positive first dimension is dropped from the shape.
                var tensorShape = op.output.TensorShape.dims;
                var columnType  = new VectorDataViewType(mlType);
                if (!(Utils.Size(tensorShape) == 1 && tensorShape[0] <= 0) &&
                    (Utils.Size(tensorShape) > 0 && tensorShape.Skip(1).All(x => x > 0)))
                {
                    columnType = new VectorDataViewType(mlType, tensorShape[0] > 0 ? tensorShape : tensorShape.Skip(1).ToArray());
                }

                // There can be at most two metadata fields.
                //  1. The first field is always present. Its value is this operator's type. For example,
                //     if an output is produced by a "Softmax" operator, the value of this field should be "Softmax".
                //  2. The second field stores operators whose outputs are consumed by this operator. In other words,
                //     these values are names of some upstream operators which should be evaluated before executing
                //     the current operator. It's possible that one operator doesn't need any input, so this field
                //     can be missing.
                var metadataBuilder = new DataViewSchema.Annotations.Builder();
                // Create the first metadata field.
                metadataBuilder.Add(TensorflowOperatorTypeKind, TextDataViewType.Instance, (ref ReadOnlyMemory<char> value) => value = op.OpType.AsMemory());
                if (op.NumInputs > 0)
                {
                    // Put upstream operators' names to an array (type: VBuffer) of string (type: ReadOnlyMemory<char>).
                    VBuffer<ReadOnlyMemory<char>> upstreamOperatorNames = default;
                    var bufferEditor = VBufferEditor.Create(ref upstreamOperatorNames, op.NumInputs);
                    for (int i = 0; i < op.NumInputs; ++i)
                    {
                        bufferEditor.Values[i] = op.inputs[i].op.name.AsMemory();
                    }
                    upstreamOperatorNames = bufferEditor.Commit(); // Used in metadata's getter.

                    // Create the second metadata field.
                    metadataBuilder.Add(TensorflowUpstreamOperatorsKind, new VectorDataViewType(TextDataViewType.Instance, op.NumInputs),
                                        (ref VBuffer<ReadOnlyMemory<char>> value) => { upstreamOperatorNames.CopyTo(ref value); });
                }

                schemaBuilder.AddColumn(op.name, columnType, metadataBuilder.ToAnnotations());
            }
            return schemaBuilder.ToSchema();
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Trains a Poisson regression model on a CSV of color mappings, scores a set of synthetic inputs,
        /// then saves the model to "test.bin" and exports it to "test.onnx".
        /// </summary>
        static void Main(string[] args)
        {
            var context  = new MLContext();
            var trainSet = context.Data.LoadFromTextFile<RawColorData>(@"D:\Axodox\Documents\rgbMapping.csv", ',');

            // Each raw channel gets a converted "N"-prefixed twin of type Single.
            var channelConversions = new[]
            {
                new InputOutputColumnPair("NR0", "R0"),
                new InputOutputColumnPair("NG0", "G0"),
                new InputOutputColumnPair("NB0", "B0"),
                new InputOutputColumnPair("NR1", "R1"),
                new InputOutputColumnPair("NG1", "G1"),
                new InputOutputColumnPair("NB1", "B1")
            };

            // Convert, divide every channel by 255, then train on (NR0, NG0, NB0) with NB1 as the label.
            var pipeline = context.Transforms.Conversion.ConvertType(channelConversions, DataKind.Single)
                .Append(context.Transforms.Expression("NR0", "NR0 => NR0 / 255", "NR0"))
                .Append(context.Transforms.Expression("NG0", "NG0 => NG0 / 255", "NG0"))
                .Append(context.Transforms.Expression("NB0", "NB0 => NB0 / 255", "NB0"))
                .Append(context.Transforms.Expression("NR1", "NR1 => NR1 / 255", "NR1"))
                .Append(context.Transforms.Expression("NG1", "NG1 => NG1 / 255", "NG1"))
                .Append(context.Transforms.Expression("NB1", "NB1 => NB1 / 255", "NB1"))
                .Append(context.Transforms.CopyColumns("Label", "NB1"))
                .Append(context.Transforms.Concatenate("Features", "NR0", "NG0", "NB0"))
                .Append(context.Transforms.SelectColumns("Label", "Features"))
                .Append(context.Regression.Trainers.LbfgsPoissonRegression());

            var model   = pipeline.Fit(trainSet);
            var scored  = model.Transform(trainSet);
            var metrics = context.Regression.Evaluate(scored);

            // Synthetic inputs: only B0 is set, stepping across the byte range.
            var samples = new RawColorData[100];
            var count   = samples.Length;
            for (var i = 0; i < count; i++)
            {
                samples[i] = new RawColorData()
                {
                    B0 = (byte)(i / (float)count * 255f)
                };
            }

            var sampleView   = context.Data.LoadFromEnumerable(samples);
            var sampleOutput = model.Transform(sampleView);
            var valuesOut    = sampleOutput.GetColumn<float>("Score").ToArray();

            // The model is saved together with a schema of the three raw byte inputs.
            var inputSchemaBuilder = new DataViewSchema.Builder();
            foreach (var channel in new[] { "R0", "G0", "B0" })
            {
                inputSchemaBuilder.AddColumn(channel, NumberDataViewType.Byte);
            }
            context.Model.Save(model, inputSchemaBuilder.ToSchema(), "test.bin");

            var exportData = context.Data.LoadFromEnumerable(new[] { new RawColorInput() });

            using (var onnxStream = new FileStream("test.onnx", FileMode.Create, FileAccess.Write))
            {
                OnnxExportExtensions.ConvertToOnnx(context.Model, model, exportData, onnxStream);
                onnxStream.Flush();
            }
        }
Ejemplo n.º 11
0
            /// <summary>
            /// Creates a matrix from row-major <paramref name="data"/>. The column types of <see cref="Schema"/> are
            /// derived from the values of the first row; supported value types are <see cref="float"/>,
            /// <see cref="DateTime"/> and <see cref="string"/>.
            /// </summary>
            /// <param name="data">The rows of the matrix; must contain at least one row.</param>
            /// <param name="columnNames">The column names, one per column, in column order.</param>
            /// <exception cref="ArgumentNullException"><paramref name="data"/> or <paramref name="columnNames"/> is null.</exception>
            /// <exception cref="ArgumentException">
            /// <paramref name="data"/> is empty, contains a null row, the first row has fewer values than there are columns,
            /// a row has more values than there are columns, a first-row value is null or of an unsupported type,
            /// or a value in a text column is not a string.
            /// </exception>
            public DynamicMatrix(object[][] data, IEnumerable <string> columnNames)
            {
                // Check arguments
                if (data is null)
                {
                    throw new ArgumentNullException(nameof(data));
                }
                if (data.Length == 0)
                {
                    throw new ArgumentException("Input data must contains at least 1 row");
                }
                if (columnNames is null)
                {
                    throw new ArgumentNullException(nameof(columnNames));
                }

                // Determine data type of each column from the first row
                var collectedColumnNames = columnNames.ToArray();
                var firstRow = data[0];
                if (firstRow is null || firstRow.Length < collectedColumnNames.Length)
                {
                    // Without this guard the type probing below fails with an index or null-reference error.
                    throw new ArgumentException($"The first row must provide a value for each of the {collectedColumnNames.Length} columns", nameof(data));
                }

                var builder = new DataViewSchema.Builder();

                for (var i = 0; i < collectedColumnNames.Length; i++)
                {
                    var columnName = collectedColumnNames[i];
                    var firstValue = firstRow[i];

                    DataViewType type;
                    if (firstValue is float)
                    {
                        type = NumberDataViewType.Single;
                    }
                    else if (firstValue is DateTime)
                    {
                        type = DateTimeDataViewType.Instance;
                    }
                    else if (firstValue is string)
                    {
                        type = TextDataViewType.Instance; // This is not for String but for ReadOnlyMemory<char>
                    }
                    else
                    {
                        // A null value also lands here; report it instead of dereferencing it.
                        throw new ArgumentException($"Unsupported type of value detected: {firstValue?.GetType().ToString() ?? "null"}");
                    }
                    builder.AddColumn(columnName, type);
                }
                Schema = builder.ToSchema();

                // Reference all values ensuring its type
                var rows = new List <object[]>();

                for (var i = 0; i < data.Length; i++)
                {
                    if (data[i] is null)
                    {
                        throw new ArgumentException($"Row {i} is null", nameof(data));
                    }

                    var row = data[i].ToArray(); // Shallow copy this row so that we can safely swap its elements
                    if (row.Length > Schema.Count)
                    {
                        throw new ArgumentException($"Row {i} has {row.Length} values but only {Schema.Count} columns are defined", nameof(data));
                    }

                    for (var j = 0; j < row.Length; j++)
                    {
                        if (Schema[j].Type == TextDataViewType.Instance)
                        {
                            if (!(row[j] is string text))
                            {
                                throw new ArgumentException($"Row {i}, column {j}: expected a string value but found {row[j]?.GetType().ToString() ?? "null"}", nameof(data));
                            }
                            row[j] = new ReadOnlyMemory <char>(text.ToCharArray());
                        }
                        //TODO: We should check type consistency here for other data types
                    }

                    rows.Add(row);
                }
                _data = rows.ToArray();
            }