Beispiel #1
0
        public void KeyToValueWorkout()
        {
            string dataPath = GetDataPath("iris.txt");

            var reader = new TextLoader(Env, new TextLoader.Arguments
            {
                Column = new[]
                {
                    new TextLoader.Column("ScalarString", DataKind.TX, 1),
                    new TextLoader.Column("VectorString", DataKind.TX, new[] { new TextLoader.Range(1, 4) }),
                    new TextLoader.Column
                    {
                        Name     = "BareKey",
                        Source   = new[] { new TextLoader.Range(0) },
                        Type     = DataKind.U4,
                        KeyRange = new KeyRange(0, 5),
                    }
                }
            });

            var data = reader.Read(dataPath);

            data = new ValueToKeyMappingEstimator(Env, new[] {
                new ValueToKeyMappingTransformer.ColumnInfo("ScalarString", "A"),
                new ValueToKeyMappingTransformer.ColumnInfo("VectorString", "B")
            }).Fit(data).Transform(data);

            var badData1 = new ColumnsCopyingTransformer(Env, ("BareKey", "A")).Transform(data);
            var badData2 = new ColumnsCopyingTransformer(Env, ("VectorString", "B")).Transform(data);

            var est = new KeyToValueMappingEstimator(Env, ("A", "A_back"), ("B", "B_back"));

            TestEstimatorCore(est, data, invalidInput: badData1);
            TestEstimatorCore(est, data, invalidInput: badData2);


            var outputPath = GetOutputPath("KeyToValue", "featurized.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                IDataView savedData = est.Fit(data).Transform(data);
                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("KeyToValue", "featurized.tsv");
            Done();
        }
Beispiel #2
0
        /// <summary>
        /// If any column names in <param name="colNames" /> are present in <param name="input" />, this
        /// method will create a transform that copies them to temporary columns. It will populate <param name="hiddenNames" />
        /// with an array of string pairs containing the original name and the generated temporary column name, respectively.
        /// </summary>
        /// <param name="env"></param>
        private static IDataView AliasIfNeeded(IHostEnvironment env, IDataView input, string[] colNames, out KeyValuePair <string, string>[] hiddenNames)
        {
            hiddenNames = null;
            var toHide = new List <string>(colNames.Length);

            foreach (var name in colNames)
            {
                int discard;
                if (input.Schema.TryGetColumnIndex(name, out discard))
                {
                    toHide.Add(name);
                }
            }

            if (toHide.Count == 0)
            {
                return(input);
            }

            hiddenNames = toHide.Select(colName =>
                                        new KeyValuePair <string, string>(colName, input.Schema.GetTempColumnName(colName))).ToArray();
            return(ColumnsCopyingTransformer.Create(env, new ColumnsCopyingTransformer.Arguments()
            {
                Column = hiddenNames.Select(pair => new ColumnsCopyingTransformer.Column()
                {
                    Name = pair.Value, Source = pair.Key
                }).ToArray()
            }, input));
        }
Beispiel #3
0
        public static CommonOutputs.TransformOutput RenameBinaryPredictionScoreColumns(IHostEnvironment env,
                                                                                       RenameBinaryPredictionScoreColumnsInput input)
        {
            Contracts.CheckValue(env, nameof(env));
            var host = env.Register("ScoreModel");

            host.CheckValue(input, nameof(input));
            EntryPointUtils.CheckInputArgs(host, input);

            if (input.PredictorModel.Predictor.PredictionKind == PredictionKind.BinaryClassification)
            {
                ColumnType labelType;
                var        labelNames = input.PredictorModel.GetLabelInfo(host, out labelType);
                if (labelNames != null && labelNames.Length == 2)
                {
                    var positiveClass = labelNames[1];

                    // Rename all the score columns.
                    int colMax;
                    var maxScoreId = input.Data.Schema.GetMaxMetadataKind(out colMax, MetadataUtils.Kinds.ScoreColumnSetId);
                    var copyCols   = new List <(string Source, string Name)>();
                    for (int i = 0; i < input.Data.Schema.ColumnCount; i++)
                    {
                        if (input.Data.Schema.IsHidden(i))
                        {
                            continue;
                        }
                        if (!ShouldAddColumn(input.Data.Schema, i, null, maxScoreId))
                        {
                            continue;
                        }
                        // Do not rename the PredictedLabel column.
                        ReadOnlyMemory <char> tmp = default;
                        if (input.Data.Schema.TryGetMetadata(TextType.Instance, MetadataUtils.Kinds.ScoreValueKind, i,
                                                             ref tmp) &&
                            ReadOnlyMemoryUtils.EqualsStr(MetadataUtils.Const.ScoreValueKind.PredictedLabel, tmp))
                        {
                            continue;
                        }
                        var source = input.Data.Schema.GetColumnName(i);
                        var name   = source + "." + positiveClass;
                        copyCols.Add((source, name));
                    }

                    var copyColumn = new ColumnsCopyingTransformer(env, copyCols.ToArray()).Transform(input.Data);
                    var dropColumn = ColumnSelectingTransformer.CreateDrop(env, copyColumn, copyCols.Select(c => c.Source).ToArray());
                    return(new CommonOutputs.TransformOutput {
                        Model = new TransformModel(env, dropColumn, input.Data), OutputData = dropColumn
                    });
                }
            }

            var newView = NopTransform.CreateIfNeeded(env, input.Data);

            return(new CommonOutputs.TransformOutput {
                Model = new TransformModel(env, newView, input.Data), OutputData = newView
            });
        }
Beispiel #4
0
        public static CommonOutputs.TransformOutput CopyColumns(IHostEnvironment env, ColumnsCopyingTransformer.Arguments input)
        {
            Contracts.CheckValue(env, nameof(env));
            var host = env.Register("CopyColumns");

            host.CheckValue(input, nameof(input));
            EntryPointUtils.CheckInputArgs(host, input);
            var xf = ColumnsCopyingTransformer.Create(env, input, input.Data);

            return(new CommonOutputs.TransformOutput {
                Model = new TransformModel(env, xf, input.Data), OutputData = xf
            });
        }
Beispiel #5
0
        private IDataView WrapPerInstance(RoleMappedData perInst)
        {
            var idv = perInst.Data;

            // Make a list of column names that Maml outputs as part of the per-instance data view, and then wrap
            // the per-instance data computed by the evaluator in a SelectColumnsTransform.
            var cols       = new List <(string Source, string Name)>();
            var colsToKeep = new List <string>();

            // If perInst is the result of cross-validation and contains a fold Id column, include it.
            int foldCol;

            if (perInst.Schema.Schema.TryGetColumnIndex(MetricKinds.ColumnNames.FoldIndex, out foldCol))
            {
                colsToKeep.Add(MetricKinds.ColumnNames.FoldIndex);
            }

            // Maml always outputs a name column, if it doesn't exist add a GenerateNumberTransform.
            if (perInst.Schema.Name == null)
            {
                var args = new GenerateNumberTransform.Arguments();
                args.Column = new[] { new GenerateNumberTransform.Column()
                                      {
                                          Name = "Instance"
                                      } };
                args.UseCounter = true;
                idv             = new GenerateNumberTransform(Host, args, idv);
                colsToKeep.Add("Instance");
            }
            else
            {
                cols.Add((perInst.Schema.Name.Name, "Instance"));
                colsToKeep.Add("Instance");
            }

            // Maml outputs the weight column if it exists.
            if (perInst.Schema.Weight != null)
            {
                colsToKeep.Add(perInst.Schema.Weight.Name);
            }

            // Get the other columns from the evaluator.
            foreach (var col in GetPerInstanceColumnsToSave(perInst.Schema))
            {
                colsToKeep.Add(col);
            }

            idv = new ColumnsCopyingTransformer(Host, cols.ToArray()).Transform(idv);
            idv = ColumnSelectingTransformer.CreateKeep(Host, idv, colsToKeep.ToArray());
            return(GetPerInstanceMetricsCore(idv, perInst.Schema));
        }
Beispiel #6
0
        private static IDataView UnaliasIfNeeded(IHostEnvironment env, IDataView input, KeyValuePair <string, string>[] hiddenNames)
        {
            if (Utils.Size(hiddenNames) == 0)
            {
                return(input);
            }

            input = ColumnsCopyingTransformer.Create(env, new ColumnsCopyingTransformer.Arguments()
            {
                Column = hiddenNames.Select(pair => new ColumnsCopyingTransformer.Column()
                {
                    Name = pair.Key, Source = pair.Value
                }).ToArray()
            }, input);

            return(ColumnSelectingTransformer.CreateDrop(env, input, hiddenNames.Select(pair => pair.Value).ToArray()));
        }
        public void NormalizerWorkout()
        {
            string dataPath = GetDataPath(TestDatasets.iris.trainFilename);

            var loader = new TextLoader(Env, new TextLoader.Arguments
            {
                Column = new[] {
                    new TextLoader.Column("float1", DataKind.R4, 1),
                    new TextLoader.Column("float4", DataKind.R4, new[] { new TextLoader.Range(1, 4) }),
                    new TextLoader.Column("double1", DataKind.R8, 1),
                    new TextLoader.Column("double4", DataKind.R8, new[] { new TextLoader.Range(1, 4) }),
                    new TextLoader.Column("int1", DataKind.I4, 0),
                    new TextLoader.Column("float0", DataKind.R4, new[] { new TextLoader.Range {
                                                                             Min = 1, VariableEnd = true
                                                                         } }),
                },
                HasHeader = true
            }, new MultiFileSource(dataPath));

            var est = new NormalizingEstimator(Env,
                                               new NormalizingEstimator.MinMaxColumn("float1"),
                                               new NormalizingEstimator.MinMaxColumn("float4"),
                                               new NormalizingEstimator.MinMaxColumn("double1"),
                                               new NormalizingEstimator.MinMaxColumn("double4"),
                                               new NormalizingEstimator.BinningColumn("float1", "float1bin"),
                                               new NormalizingEstimator.BinningColumn("float4", "float4bin"),
                                               new NormalizingEstimator.BinningColumn("double1", "double1bin"),
                                               new NormalizingEstimator.BinningColumn("double4", "double4bin"),
                                               new NormalizingEstimator.MeanVarColumn("float1", "float1mv"),
                                               new NormalizingEstimator.MeanVarColumn("float4", "float4mv"),
                                               new NormalizingEstimator.MeanVarColumn("double1", "double1mv"),
                                               new NormalizingEstimator.MeanVarColumn("double4", "double4mv"),
                                               new NormalizingEstimator.LogMeanVarColumn("float1", "float1lmv"),
                                               new NormalizingEstimator.LogMeanVarColumn("float4", "float4lmv"),
                                               new NormalizingEstimator.LogMeanVarColumn("double1", "double1lmv"),
                                               new NormalizingEstimator.LogMeanVarColumn("double4", "double4lmv"));

            var data = loader.Read(dataPath);

            var badData1 = new ColumnsCopyingTransformer(Env, ("int1", "float1")).Transform(data);
            var badData2 = new ColumnsCopyingTransformer(Env, ("float0", "float4")).Transform(data);

            TestEstimatorCore(est, data, null, badData1);
            TestEstimatorCore(est, data, null, badData2);

            var outputPath = GetOutputPath("NormalizerEstimator", "normalized.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                using (var fs = File.Create(outputPath))
                {
                    var dataView = ColumnSelectingTransformer.CreateDrop(Env, est.Fit(data).Transform(data), "float0");
                    DataSaverUtils.SaveDataView(ch, saver, dataView, fs, keepHidden: true);
                }
            }

            CheckEquality("NormalizerEstimator", "normalized.tsv");

            Done();
        }
Beispiel #8
0
        public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input)
        {
            Contracts.CheckValue(env, nameof(env));
            var h = env.Register(LoaderSignature);

            h.CheckValue(args, nameof(args));
            h.CheckValue(input, nameof(input));
            h.CheckNonWhiteSpace(args.Source, nameof(args.Source));

            if (string.IsNullOrWhiteSpace(args.Name))
            {
                args.Name = args.Source;
            }

            var file = Utils.FindExistentFileOrNull("pretrained.model", "Sentiment", assemblyForBasePath: typeof(SentimentAnalyzingTransformer));

            if (file == null)
            {
                throw h.Except("resourcePath", "Missing resource for SentimentAnalyzingTransform.");
            }

            // The logic below ensures that any columns in our input IDataView that conflict
            // with column names known to be used in the pretrained model transform pipeline we're
            // loading are aliased to temporary column names before we apply the pipeline and then
            // renamed back to their original names after. We do this to ensure the pretrained model
            // doesn't shadow or replace columns we aren't expecting it to.

            // 1. Alias any column in the input IDataView that is known to appear to the pretrained
            // model into a temporary column so that we can restore them after the pretrained model
            // is added to the pipeline.
            KeyValuePair <string, string>[] aliased;
            input = AliasIfNeeded(env, input, _modelIntermediateColumnNames, out aliased);

            // 2. Copy source column to a column with the name expected by the pretrained model featurization
            // transform pipeline.
            var copyTransformer = new ColumnsCopyingTransformer(env, (args.Source, ModelInputColumnName));

            input = copyTransformer.Transform(input);

            // 3. Apply the pretrained model and its featurization transform pipeline.
            input = LoadTransforms(env, input, file);

            // 4. Copy the output column from the pretrained model to a temporary column.
            var scoreTempName = input.Schema.GetTempColumnName("sa_out");

            copyTransformer = new ColumnsCopyingTransformer(env, (ModelScoreColumnName, scoreTempName));
            input           = copyTransformer.Transform(input);

            // 5. Drop all the columns created by the pretrained model, including the expected input column
            // and the output column, which we have copied to a temporary column in (4).
            input = ColumnSelectingTransformer.CreateDrop(env, input, _modelIntermediateColumnNames);

            // 6. Unalias all the original columns that were originally present in the IDataView, but may have
            // been shadowed by column names in the pretrained model. This method will also drop all the temporary
            // columns that were created for them in (1).
            input = UnaliasIfNeeded(env, input, aliased);

            // 7. Copy the temporary column with the score we created in (4) to a column with the user-specified destination name.
            copyTransformer = new ColumnsCopyingTransformer(env, (scoreTempName, args.Name));
            input           = copyTransformer.Transform(input);

            // 8. Drop the temporary column with the score created in (4).
            return(ColumnSelectingTransformer.CreateDrop(env, input, scoreTempName));
        }