예제 #1
0
        public void ConcatWithAliases()
        {
            string dataPath = GetDataPath("adult.tiny.with-schema.txt");

            var source = new MultiFileSource(dataPath);
            var loader = new TextLoader(ML, new TextLoader.Options
            {
                Columns = new[] {
                    new TextLoader.Column("float1", DataKind.Single, 9),
                    new TextLoader.Column("float4", DataKind.Single, new[] { new TextLoader.Range(9), new TextLoader.Range(10), new TextLoader.Range(11), new TextLoader.Range(12) }),
                    new TextLoader.Column("vfloat", DataKind.Single, new[] { new TextLoader.Range(9), new TextLoader.Range(10), new TextLoader.Range(11), new TextLoader.Range(12, null)
                                                                             {
                                                                                 AutoEnd = false, VariableEnd = true
                                                                             } })
                },
                Separator = "\t",
                HasHeader = true
            }, new MultiFileSource(dataPath));
            var data = loader.Load(source);

            DataViewType GetType(DataViewSchema schema, string name)
            {
                Assert.True(schema.TryGetColumnIndex(name, out int cIdx), $"Could not find '{name}'");
                return(schema[cIdx].Type);
            }

            data = ML.Data.TakeRows(data, 10);

            var concater = new ColumnConcatenatingTransformer(ML,
                                                              new ColumnConcatenatingTransformer.ColumnOptions("f2", new[] { ("float1", "FLOAT1"), ("float1", "FLOAT2") }),
예제 #2
0
        public ITransformer Fit(IDataView input)
        {
            var h = _host;

            h.CheckValue(input, nameof(input));

            var tparams = new TransformApplierParams(this);

            string[]      textCols       = _inputColumns;
            string[]      wordTokCols    = null;
            string[]      charTokCols    = null;
            string        wordFeatureCol = null;
            string        charFeatureCol = null;
            List <string> tempCols       = new List <string>();
            IDataView     view           = input;

            if (tparams.NeedInitialSourceColumnConcatTransform && textCols.Length > 1)
            {
                var srcCols = textCols;
                textCols = new[] { GenerateColumnName(input.Schema, OutputColumn, "InitialConcat") };
                tempCols.Add(textCols[0]);
                view = new ColumnConcatenatingTransformer(h, textCols[0], srcCols).Transform(view);
            }

            if (tparams.NeedsNormalizeTransform)
            {
                var      xfCols  = new (string input, string output)[textCols.Length];
예제 #3
0
        public void ConcatWithAliases()
        {
            string dataPath = GetDataPath("adult.test");

            var source = new MultiFileSource(dataPath);
            var loader = new TextLoader(Env, new TextLoader.Arguments
            {
                Column = new[] {
                    new TextLoader.Column("float1", DataKind.R4, 0),
                    new TextLoader.Column("float4", DataKind.R4, new[] { new TextLoader.Range(0), new TextLoader.Range(2), new TextLoader.Range(4), new TextLoader.Range(10) }),
                    new TextLoader.Column("vfloat", DataKind.R4, new[] { new TextLoader.Range(0), new TextLoader.Range(2), new TextLoader.Range(4), new TextLoader.Range(10, null)
                                                                         {
                                                                             AutoEnd = false, VariableEnd = true
                                                                         } })
                },
                Separator = ",",
                HasHeader = true
            }, new MultiFileSource(dataPath));
            var data = loader.Read(source);

            ColumnType GetType(Schema schema, string name)
            {
                Assert.True(schema.TryGetColumnIndex(name, out int cIdx), $"Could not find '{name}'");
                return(schema.GetColumnType(cIdx));
            }

            data = TakeFilter.Create(Env, data, 10);

            var concater = new ColumnConcatenatingTransformer(Env,
                                                              new ColumnConcatenatingTransformer.ColumnInfo("f2", new[] { ("float1", "FLOAT1"), ("float1", "FLOAT2") }),
        /// <summary>
        /// Loads the supplied samples, splits them into train and test partitions, trains a
        /// non-calibrated SDCA multiclass classifier on the train partition, and writes the
        /// evaluation metrics for the test partition to the console.
        /// </summary>
        /// <param name="inputs">Sample inputs; forwarded to GetSampleData together with <paramref name="labels"/>.</param>
        /// <param name="labels">Sample labels; forwarded to GetSampleData together with <paramref name="inputs"/>.</param>
        public StocasticDualCoordianteAscent(double[][] inputs, double[] labels)
        {
            IDataView data_in = context.Data.LoadFromEnumerable<_data>(GetSampleData(inputs, labels));

            DataOperationsCatalog.TrainTestData partitions = context.Data.TrainTestSplit(data_in);

            // Estimators are not mutated by AppendCacheCheckpoint/Append: each call returns a new
            // chain. The results must therefore be kept and chained; otherwise the trainer is
            // never part of the fitted pipeline and Evaluate would run on un-scored data.
            var pipeline = context.Transforms.Concatenate("Features", nameof(_data.Features))
                           .AppendCacheCheckpoint(context)
                           .Append(context.MulticlassClassification.Trainers.SdcaNonCalibrated());

            // The fitted chain contains both the concatenation and the trained predictor.
            var model = pipeline.Fit(partitions.TrainSet);

            Console.WriteLine("Evaluating model....");
            IDataView predictions = model.Transform(partitions.TestSet);

            // evaluate the predictions
            MulticlassClassificationMetrics metrics = context.MulticlassClassification.Evaluate(predictions);

            // show evaluation metrics
            Console.WriteLine("Evaluation metrics");
            Console.WriteLine($"    MicroAccuracy:    {metrics.MicroAccuracy:0.###}");
            Console.WriteLine($"    MacroAccuracy:    {metrics.MacroAccuracy:0.###}");
            Console.WriteLine($"    LogLoss:          {metrics.LogLoss:#.###}");
            Console.WriteLine($"    LogLossReduction: {metrics.LogLossReduction:#.###}");
            Console.WriteLine();
        }
예제 #5
0
            /// <summary>
            /// Creates a mapper for <paramref name="parent"/> over <paramref name="inputSchema"/>,
            /// building one bound column (via MakeColumn) for each column declared on the parent.
            /// </summary>
            /// <param name="parent">The owning transformer; checked for null before its host is used.</param>
            /// <param name="inputSchema">The schema the columns are bound against.</param>
            public Mapper(ColumnConcatenatingTransformer parent, DataViewSchema inputSchema) :
                base(Contracts.CheckRef(parent, nameof(parent)).Host.Register(nameof(Mapper)), inputSchema, parent)
            {
                _parent = parent;

                int columnCount = _parent._columns.Length;
                _columns = new BoundColumn[columnCount];

                int index = 0;
                while (index < columnCount)
                {
                    _columns[index] = MakeColumn(inputSchema, index);
                    index++;
                }
            }
        /// <summary>
        /// Entry point that gathers the columns holding the Feature role in <paramref name="input"/>,
        /// runs them through ConvertFeatures / ApplyConvert / ApplyKeyToVec, and concatenates the
        /// result into a single column named <c>DefaultColumnNames.Features</c>.
        /// </summary>
        /// <param name="env">Host environment; must not be null.</param>
        /// <param name="input">Entry-point input carrying the data and its role mappings.</param>
        /// <returns>The transform model together with the transformed data view.</returns>
        public static CommonOutputs.TransformOutput PrepareFeatures(IHostEnvironment env, FeatureCombinerInput input)
        {
            const string featureCombiner = "FeatureCombiner";

            Contracts.CheckValue(env, nameof(env));
            var host = env.Register(featureCombiner);
            host.CheckValue(input, nameof(input));
            EntryPointUtils.CheckInputArgs(host, input);

            using (var ch = host.Start(featureCombiner))
            {
                var view = input.Data;

                // Resolve which columns carry the Feature role; at least one is required.
                var roleSchema     = new RoleMappedSchema(view.Schema, input.GetRoles());
                var featureColumns = roleSchema.GetColumns(RoleMappedSchema.ColumnRole.Feature);
                if (Utils.Size(featureColumns) == 0)
                {
                    throw ch.Except("No feature columns specified");
                }

                var uniqueNames   = new HashSet<string>();
                var taggedSources = new List<KeyValuePair<string, string>>();
                var keyToVector   = ConvertFeatures(featureColumns.ToArray(), uniqueNames, taggedSources, ch,
                                                    out List<TypeConvertingEstimator.ColumnOptions> conversions,
                                                    out int invalidCount);
                Contracts.Assert(uniqueNames.Count > 0);
                Contracts.Assert(taggedSources.Count == uniqueNames.Count);
                if (invalidCount > 0)
                {
                    throw ch.Except("Encountered {0} invalid training column(s)", invalidCount);
                }

                view = ApplyConvert(conversions, view, host);
                view = ApplyKeyToVec(keyToVector, view, host);

                // REVIEW: What about column name conflicts? Eg, what if someone uses the group id column
                // (a key type) as a feature column. We convert that column to a vector so it is no longer valid
                // as a group id. That's just one example - you get the idea.
                var featuresColumn = new ColumnConcatenatingTransformer.TaggedColumn()
                {
                    Name   = DefaultColumnNames.Features,
                    Source = taggedSources.ToArray()
                };
                var concatOptions = new ColumnConcatenatingTransformer.TaggedOptions()
                {
                    Columns = new[] { featuresColumn }
                };
                view = ColumnConcatenatingTransformer.Create(host, concatOptions, view);

                return new CommonOutputs.TransformOutput
                {
                    Model      = new TransformModelImpl(env, view, input.Data),
                    OutputData = view
                };
            }
        }
예제 #7
0
        /// <summary>
        /// Entry point that applies <c>ColumnConcatenatingTransformer.Create</c> to the data carried
        /// by <paramref name="input"/> and wraps the result as a transform output.
        /// </summary>
        /// <param name="env">Host environment; must not be null.</param>
        /// <param name="input">Concatenation arguments, including the input data view.</param>
        /// <returns>The transform model together with the transformed data view.</returns>
        public static CommonOutputs.TransformOutput ConcatColumns(IHostEnvironment env, ColumnConcatenatingTransformer.Arguments input)
        {
            Contracts.CheckValue(env, nameof(env));

            // The registered host is used only for argument validation below.
            var host = env.Register("ConcatColumns");
            host.CheckValue(input, nameof(input));
            EntryPointUtils.CheckInputArgs(host, input);

            var concatenated = ColumnConcatenatingTransformer.Create(env, input, input.Data);
            var model        = new TransformModelImpl(env, concatenated, input.Data);

            return new CommonOutputs.TransformOutput
            {
                Model      = model,
                OutputData = concatenated
            };
        }
예제 #8
0
            /// <summary>
            /// Creates a mapper for <paramref name="parent"/> over <paramref name="inputSchema"/>,
            /// registering a child host and building one bound column (via MakeColumn) for each
            /// column declared on the parent.
            /// </summary>
            /// <param name="parent">The owning transformer; asserted to be non-null.</param>
            /// <param name="inputSchema">The schema the columns are bound against; asserted to be non-null.</param>
            public Mapper(ColumnConcatenatingTransformer parent, Schema inputSchema)
            {
                Contracts.AssertValue(parent);
                Contracts.AssertValue(inputSchema);

                _host        = parent._host.Register(nameof(Mapper));
                _parent      = parent;
                _inputSchema = inputSchema;

                int columnCount = _parent._columns.Length;
                _columns = new BoundColumn[columnCount];

                int index = 0;
                while (index < columnCount)
                {
                    _columns[index] = MakeColumn(inputSchema, index);
                    index++;
                }
            }
        /// <summary>
        /// Trains an SDCA multiclass model on iris.txt by instantiating each component directly,
        /// then checks the evaluation metrics, a prediction engine built from the scorer, and the
        /// first entry of the predictor's summary.
        /// </summary>
        public void TrainAndPredictIrisModelUsingDirectInstantiationTest()
        {
            string dataPath     = GetDataPath("iris.txt");
            string testDataPath = dataPath; // the same file is used for training and testing

            using (var env = new ConsoleEnvironment(seed: 1, conc: 1))
            {
                // Pipeline: label in column 0, the four measurements in columns 1-4.
                var irisColumns = new[]
                {
                    new TextLoader.Column("Label", DataKind.R4, 0),
                    new TextLoader.Column("SepalLength", DataKind.R4, 1),
                    new TextLoader.Column("SepalWidth", DataKind.R4, 2),
                    new TextLoader.Column("PetalLength", DataKind.R4, 3),
                    new TextLoader.Column("PetalWidth", DataKind.R4, 4)
                };
                var loaderArgs = new TextLoader.Arguments()
                {
                    HasHeader = false,
                    Column    = irisColumns
                };
                var loader = TextLoader.ReadFile(env, loaderArgs, new MultiFileSource(dataPath));

                var concat = new ColumnConcatenatingTransformer(env, "Features",
                                                                "SepalLength", "SepalWidth", "PetalLength", "PetalWidth");
                IDataView pipeline = concat.Transform(loader);

                // NormalizingEstimator is not automatically added though the trainer has 'NormalizeFeatures' On/Auto
                pipeline = NormalizeTransform.CreateMinMaxNormalizer(env, pipeline, "Features");

                // Train
                var trainer = new SdcaMultiClassTrainer(env, "Label", "Features", advancedSettings: s => { s.NumThreads = 1; });

                // Explicitly adding CacheDataView since caching is not working though trainer has 'Caching' On/Auto
                var cachedView   = new CacheDataView(env, pipeline, prefetch: null);
                var trainingData = new RoleMappedData(cachedView, label: "Label", feature: "Features");
                var predictor    = trainer.Train(trainingData);

                // Get scorer and evaluate the predictions from test data
                IDataScorerTransform scorer = GetScorer(env, pipeline, predictor, testDataPath);
                CompareMatrics(Evaluate(env, scorer));

                // Create prediction engine and test predictions
                var engine = env.CreatePredictionEngine<IrisData, IrisPrediction>(scorer);
                ComparePredictions(engine);

                // Get feature importance i.e. weight vector
                var logistic   = (MulticlassLogisticRegressionPredictor)predictor;
                var summary    = logistic.GetSummaryInKeyValuePairs(trainingData.Schema);
                double weight0 = Convert.ToDouble(summary[0].Value);
                Assert.Equal(7.757864, weight0, 5);
            }
        }
예제 #10
0
        /// <summary>
        /// Entry point that, for each requested column, concatenates every input column whose name
        /// starts with the prefix given in that column's <c>Source</c> into an output column named
        /// by its <c>Name</c>. Matching columns are taken in input-schema order.
        /// </summary>
        /// <param name="env">Host environment; must not be null.</param>
        /// <param name="input">Options carrying the input data and the (name, prefix) pairs.</param>
        /// <returns>The transform model together with the transformed data view.</returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// Thrown when a prefix does not match any column of the input schema.
        /// </exception>
        public static CommonOutputs.TransformOutput ConcatColumns(IHostEnvironment env, ColumnCopyingTransformer.Options input)
        {
            Contracts.CheckValue(env, nameof(env));
            var host = env.Register("PrefixConcatColumns");

            host.CheckValue(input, nameof(input));
            EntryPointUtils.CheckInputArgs(host, input);

            // Get all column names with preserving order.
            var colNames = new List<string>(input.Data.Schema.Count);
            for (int i = 0; i < input.Data.Schema.Count; i++)
            {
                colNames.Add(input.Data.Schema[i].Name);
            }

            // Iterate through input options, find matching source columns, create new input options
            var inputOptions = new ColumnConcatenatingTransformer.Options()
            {
                Data = input.Data
            };
            var columns = new List<ColumnConcatenatingTransformer.Column>(input.Columns.Length);

            foreach (var col in input.Columns)
            {
                var prefix = col.Source;
                var newCol = new ColumnConcatenatingTransformer.Column
                {
                    Name   = col.Name,
                    Source = colNames.Where(x => x.StartsWith(prefix, StringComparison.InvariantCulture)).ToArray()
                };

                if (newCol.Source.Length == 0)
                {
                    // The single-string constructor of ArgumentOutOfRangeException interprets its
                    // argument as the parameter NAME, which would hide the explanation. Use the
                    // (paramName, message) overload so the text is surfaced as the message.
                    throw new ArgumentOutOfRangeException(nameof(input), "No matching columns found for prefix: " + prefix);
                }

                columns.Add(newCol);
            }
            inputOptions.Columns = columns.ToArray();

            var xf = ColumnConcatenatingTransformer.Create(env, inputOptions, inputOptions.Data);

            return new CommonOutputs.TransformOutput
            {
                Model      = new TransformModelImpl(env, xf, inputOptions.Data),
                OutputData = xf
            };
        }
예제 #11
0
        /// <summary>
        /// Validates every entry of <paramref name="columns"/> and, for the entries that list more
        /// than one source, applies a single concatenating transform producing a column named
        /// after the entry. When no entry has multiple sources, <paramref name="input"/> is
        /// returned unchanged.
        /// </summary>
        /// <param name="env">Host environment; must not be null.</param>
        /// <param name="columns">Many-to-one column specifications to validate and concatenate.</param>
        /// <param name="input">The data view to transform.</param>
        /// <returns>The concatenated view, or <paramref name="input"/> itself when nothing needs concatenating.</returns>
        public static IDataView ApplyConcatOnSources(IHostEnvironment env, ManyToOneColumn[] columns, IDataView input)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(columns, nameof(columns));
            env.CheckValue(input, nameof(input));

            var multiSource = new List<ColumnConcatenatingTransformer.Column>();

            foreach (var col in columns)
            {
                env.CheckUserArg(col != null, nameof(WordBagBuildingTransformer.Arguments.Column));
                env.CheckUserArg(!string.IsNullOrWhiteSpace(col.Name), nameof(col.Name));
                env.CheckUserArg(Utils.Size(col.Source) > 0, nameof(col.Source));
                env.CheckUserArg(!col.Source.Any(src => string.IsNullOrWhiteSpace(src)), nameof(col.Source));

                // Single-source entries need no concatenation.
                if (col.Source.Length <= 1)
                {
                    continue;
                }

                var concatColumn = new ColumnConcatenatingTransformer.Column();
                concatColumn.Source = col.Source;
                concatColumn.Name   = col.Name;
                multiSource.Add(concatColumn);
            }

            if (multiSource.Count == 0)
            {
                return input;
            }

            var concatArgs = new ColumnConcatenatingTransformer.Arguments
            {
                Column = multiSource.ToArray()
            };
            return ColumnConcatenatingTransformer.Create(env, concatArgs, input);
        }
        /// <summary>
        /// Loads the data at <c>_dataPath</c>, one-hot encodes the categorical columns, normalizes
        /// the numeric ones, appends KMeans++ scores to the features, and trains a logistic
        /// regression model on the combined feature vector.
        /// </summary>
        /// <returns>The predictor returned by the logistic regression trainer.</returns>
        public ParameterMixingCalibratedPredictor TrainKMeansAndLR()
        {
            using (var env = new ConsoleEnvironment(seed: 1, verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance))
            {
                // Pipeline
                var categoricalRanges = new[]
                {
                    new TextLoader.Range() { Min = 1, Max = 1 },
                    new TextLoader.Range() { Min = 3, Max = 3 },
                    new TextLoader.Range() { Min = 5, Max = 9 },
                    new TextLoader.Range() { Min = 13, Max = 13 }
                };
                var numericRanges = new[]
                {
                    new TextLoader.Range() { Min = 0, Max = 0 },
                    new TextLoader.Range() { Min = 2, Max = 2 },
                    new TextLoader.Range() { Min = 4, Max = 4 },
                    new TextLoader.Range() { Min = 10, Max = 12 }
                };
                var loaderArgs = new TextLoader.Arguments()
                {
                    HasHeader = true,
                    Separator = ",",
                    Column    = new[]
                    {
                        new TextLoader.Column("Label", DataKind.R4, 14),
                        new TextLoader.Column("CatFeatures", DataKind.TX, categoricalRanges),
                        new TextLoader.Column("NumFeatures", DataKind.R4, numericRanges)
                    }
                };
                var loader = TextLoader.ReadFile(env, loaderArgs, new MultiFileSource(_dataPath));

                // Encode categoricals, normalize numerics, then merge both into "Features".
                IDataView view = new OneHotEncodingEstimator(env, "CatFeatures").Fit(loader).Transform(loader);
                view = NormalizeTransform.CreateMinMaxNormalizer(env, view, "NumFeatures");
                view = new ColumnConcatenatingTransformer(env, "Features", "NumFeatures", "CatFeatures").Transform(view);

                // Train and score KMeans++ with K = 100, then append its "Score" column to "Features".
                var kmeansArgs = new TrainAndScoreTransformer.Arguments
                {
                    Trainer = ComponentFactoryUtils.CreateFromFunction(host =>
                                  new KMeansPlusPlusTrainer(host, "Features", advancedSettings: s => { s.K = 100; })),
                    FeatureColumn = "Features"
                };
                view = TrainAndScoreTransformer.Create(env, kmeansArgs, view);
                view = new ColumnConcatenatingTransformer(env, "Features", "Features", "Score").Transform(view);

                // Train
                var trainer = new LogisticRegression(env, "Label", "Features", advancedSettings: args =>
                {
                    args.EnforceNonNegativity = true;
                    args.OptTol               = 1e-3f;
                });
                var trainingData = new RoleMappedData(view, label: "Label", feature: "Features");
                return trainer.Train(trainingData);
            }
        }
        /// <summary>
        /// Factory method for SignatureDataTransform.
        /// Builds, in order: a missing-value indicator transform (only for columns that request an
        /// indicator), a type conversion of the indicator columns (only where the indicator type
        /// differs from the source item type), the missing-value replacement transform, a
        /// concatenation of each replaced column with its indicator, and finally a drop of the
        /// temporary columns created along the way.
        /// </summary>
        /// <param name="env">Host environment; must not be null.</param>
        /// <param name="options">Transform options; must specify at least one column.</param>
        /// <param name="input">The data view to transform.</param>
        /// <returns>The last transform of the constructed chain.</returns>
        internal static IDataTransform Create(IHostEnvironment env, Options options, IDataView input)
        {
            Contracts.CheckValue(env, nameof(env));
            var h = env.Register("Categorical");

            h.CheckValue(options, nameof(options));
            h.CheckValue(input, nameof(input));
            h.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns));

            var replaceCols          = new List<MissingValueReplacingEstimator.ColumnInfo>();
            var naIndicatorCols      = new List<MissingValueIndicatorTransformer.Column>();
            var naConvCols           = new List<TypeConvertingEstimator.ColumnInfo>();
            var concatCols           = new List<ColumnConcatenatingTransformer.TaggedColumn>();
            var dropCols             = new List<string>();
            var tmpIsMissingColNames = input.Schema.GetTempColumnNames(options.Columns.Length, "IsMissing");
            var tmpReplaceColNames   = input.Schema.GetTempColumnNames(options.Columns.Length, "Replace");

            for (int i = 0; i < options.Columns.Length; i++)
            {
                var column = options.Columns[i];

                // Per-column settings fall back to the transform-wide options when unset.
                var replacementMode = (MissingValueReplacingEstimator.ColumnInfo.ReplacementMode)(column.Kind ?? options.ReplaceWith);
                var imputeBySlot    = column.ImputeBySlot ?? options.ImputeBySlot;

                var addInd = column.ConcatIndicator ?? options.Concat;
                if (!addInd)
                {
                    // No indicator requested: replace directly into the final output column.
                    replaceCols.Add(new MissingValueReplacingEstimator.ColumnInfo(column.Name, column.Source,
                                                                                  replacementMode, imputeBySlot));
                    continue;
                }

                // Check that the indicator column has a type that can be converted to the NAReplaceTransform output type,
                // so that they can be concatenated.
                if (!input.Schema.TryGetColumnIndex(column.Source, out int inputCol))
                {
                    throw h.Except("Column '{0}' does not exist", column.Source);
                }
                var replaceType     = input.Schema[inputCol].Type;
                var replaceItemType = replaceType.GetItemType();
                if (!Data.Conversion.Conversions.Instance.TryGetStandardConversion(BooleanDataViewType.Instance, replaceItemType, out Delegate conv, out bool identity))
                {
                    throw h.Except("Cannot concatenate indicator column of type '{0}' to input column of type '{1}'",
                                   BooleanDataViewType.Instance, replaceItemType);
                }

                // Find a temporary name for the NAReplaceTransform and NAIndicatorTransform output columns.
                var tmpIsMissingColName   = tmpIsMissingColNames[i];
                var tmpReplacementColName = tmpReplaceColNames[i];

                // Add an NAHandleTransform column.
                naIndicatorCols.Add(new MissingValueIndicatorTransformer.Column()
                {
                    Name = tmpIsMissingColName, Source = column.Source
                });

                // Add a ConvertTransform column if necessary.
                if (!identity)
                {
                    if (!replaceItemType.RawType.TryGetDataKind(out DataKind replaceItemTypeKind))
                    {
                        throw h.Except("Cannot get a DataKind for type '{0}'", replaceItemType.RawType);
                    }
                    naConvCols.Add(new TypeConvertingEstimator.ColumnInfo(tmpIsMissingColName, replaceItemTypeKind, tmpIsMissingColName));
                }

                // Add the NAReplaceTransform column.
                replaceCols.Add(new MissingValueReplacingEstimator.ColumnInfo(tmpReplacementColName, column.Source,
                                                                              replacementMode, imputeBySlot));

                // Add the ConcatTransform column. The first element of each pair differs between
                // vector and non-vector sources; the second is the temporary column to read from.
                if (replaceType is VectorType)
                {
                    concatCols.Add(new ColumnConcatenatingTransformer.TaggedColumn()
                    {
                        Name   = column.Name,
                        Source = new[]
                        {
                            new KeyValuePair<string, string>(tmpReplacementColName, tmpReplacementColName),
                            new KeyValuePair<string, string>("IsMissing", tmpIsMissingColName)
                        }
                    });
                }
                else
                {
                    concatCols.Add(new ColumnConcatenatingTransformer.TaggedColumn()
                    {
                        Name   = column.Name,
                        Source = new[]
                        {
                            new KeyValuePair<string, string>(column.Source, tmpReplacementColName),
                            new KeyValuePair<string, string>(string.Format("IsMissing.{0}", column.Source), tmpIsMissingColName),
                        }
                    });
                }

                // Add the temp column to the list of columns to drop at the end.
                dropCols.Add(tmpIsMissingColName);
                dropCols.Add(tmpReplacementColName);
            }

            IDataTransform output = null;

            // Create the indicator columns.
            if (naIndicatorCols.Count > 0)
            {
                output = MissingValueIndicatorTransformer.Create(h, new MissingValueIndicatorTransformer.Options()
                {
                    Columns = naIndicatorCols.ToArray()
                }, input);
            }

            // Convert the indicator columns to the correct type so that they can be concatenated to the NAReplace outputs.
            if (naConvCols.Count > 0)
            {
                h.AssertValue(output);
                //REVIEW: all this need to be converted to estimatorChain as soon as we done with dropcolumns.
                // Direct cast rather than 'as': a failed 'as' would yield null, and the
                // 'output ?? input' below would then silently discard the indicator and
                // conversion stages instead of reporting the problem.
                output = (IDataTransform)new TypeConvertingTransformer(h, naConvCols.ToArray()).Transform(output);
            }

            // Create the NAReplace transform.
            output = MissingValueReplacingTransformer.Create(env, output ?? input, replaceCols.ToArray());

            // Concat the NAReplaceTransform output and the NAIndicatorTransform output.
            if (naIndicatorCols.Count > 0)
            {
                output = ColumnConcatenatingTransformer.Create(h, new ColumnConcatenatingTransformer.TaggedOptions()
                {
                    Columns = concatCols.ToArray()
                }, output);
            }

            // Finally, drop the temporary indicator columns.
            if (dropCols.Count > 0)
            {
                // Direct cast rather than 'as' so that an unexpected result type fails here
                // instead of returning null to the caller.
                output = (IDataTransform)ColumnSelectingTransformer.CreateDrop(h, output, dropCols.ToArray());
            }

            return output;
        }