public void ConcatWithAliases() { string dataPath = GetDataPath("adult.tiny.with-schema.txt"); var source = new MultiFileSource(dataPath); var loader = new TextLoader(ML, new TextLoader.Options { Columns = new[] { new TextLoader.Column("float1", DataKind.Single, 9), new TextLoader.Column("float4", DataKind.Single, new[] { new TextLoader.Range(9), new TextLoader.Range(10), new TextLoader.Range(11), new TextLoader.Range(12) }), new TextLoader.Column("vfloat", DataKind.Single, new[] { new TextLoader.Range(9), new TextLoader.Range(10), new TextLoader.Range(11), new TextLoader.Range(12, null) { AutoEnd = false, VariableEnd = true } }) }, Separator = "\t", HasHeader = true }, new MultiFileSource(dataPath)); var data = loader.Load(source); DataViewType GetType(DataViewSchema schema, string name) { Assert.True(schema.TryGetColumnIndex(name, out int cIdx), $"Could not find '{name}'"); return(schema[cIdx].Type); } data = ML.Data.TakeRows(data, 10); var concater = new ColumnConcatenatingTransformer(ML, new ColumnConcatenatingTransformer.ColumnOptions("f2", new[] { ("float1", "FLOAT1"), ("float1", "FLOAT2") }),
public ITransformer Fit(IDataView input) { var h = _host; h.CheckValue(input, nameof(input)); var tparams = new TransformApplierParams(this); string[] textCols = _inputColumns; string[] wordTokCols = null; string[] charTokCols = null; string wordFeatureCol = null; string charFeatureCol = null; List <string> tempCols = new List <string>(); IDataView view = input; if (tparams.NeedInitialSourceColumnConcatTransform && textCols.Length > 1) { var srcCols = textCols; textCols = new[] { GenerateColumnName(input.Schema, OutputColumn, "InitialConcat") }; tempCols.Add(textCols[0]); view = new ColumnConcatenatingTransformer(h, textCols[0], srcCols).Transform(view); } if (tparams.NeedsNormalizeTransform) { var xfCols = new (string input, string output)[textCols.Length];
public void ConcatWithAliases() { string dataPath = GetDataPath("adult.test"); var source = new MultiFileSource(dataPath); var loader = new TextLoader(Env, new TextLoader.Arguments { Column = new[] { new TextLoader.Column("float1", DataKind.R4, 0), new TextLoader.Column("float4", DataKind.R4, new[] { new TextLoader.Range(0), new TextLoader.Range(2), new TextLoader.Range(4), new TextLoader.Range(10) }), new TextLoader.Column("vfloat", DataKind.R4, new[] { new TextLoader.Range(0), new TextLoader.Range(2), new TextLoader.Range(4), new TextLoader.Range(10, null) { AutoEnd = false, VariableEnd = true } }) }, Separator = ",", HasHeader = true }, new MultiFileSource(dataPath)); var data = loader.Read(source); ColumnType GetType(Schema schema, string name) { Assert.True(schema.TryGetColumnIndex(name, out int cIdx), $"Could not find '{name}'"); return(schema.GetColumnType(cIdx)); } data = TakeFilter.Create(Env, data, 10); var concater = new ColumnConcatenatingTransformer(Env, new ColumnConcatenatingTransformer.ColumnInfo("f2", new[] { ("float1", "FLOAT1"), ("float1", "FLOAT2") }),
// Builds, trains, and evaluates a multiclass SDCA model over the supplied feature/label arrays.
// Splits the data into train/test partitions, fits the pipeline on the train set, and prints
// the multiclass evaluation metrics for the test set to the console.
// NOTE: the type name contains typos ("Stocastic", "Coordiante") but is part of the public
// interface and is therefore left unchanged.
public StocasticDualCoordianteAscent(double[][] inputs, double[] labels) { IDataView data_in = context.Data.LoadFromEnumerable <_data>(GetSampleData(inputs, labels)); DataOperationsCatalog.TrainTestData partitions = context.Data.TrainTestSplit(data_in);
    // BUG FIX: the original called pipeline.AppendCacheCheckpoint(...) and pipeline.Append(...)
    // and discarded the return values. ML.NET estimators are immutable: these methods return a
    // NEW estimator chain rather than mutating the receiver, so the cache checkpoint and the
    // SDCA trainer were never part of the fitted pipeline (Fit produced only the concat
    // transform). Chaining the calls makes the trainer actually train.
    var pipeline = context.Transforms.Concatenate("Features", nameof(_data.Features))
        .AppendCacheCheckpoint(context)
        .Append(context.MulticlassClassification.Trainers.SdcaNonCalibrated());
    // Fit the full chain (concat + cache checkpoint + trainer) on the training partition.
    var model = pipeline.Fit(partitions.TrainSet);
    //var engine = ModelOperationsCatalog.CreatePredictionEngine<Digit, DigitPrediction>(model);
    Console.WriteLine("Evaluating model....");
    IDataView predictions = model.Transform(partitions.TestSet);
    // evaluate the predictions
    MulticlassClassificationMetrics metrics = context.MulticlassClassification.Evaluate(predictions);
    // show evaluation metrics
    Console.WriteLine($"Evaluation metrics");
    Console.WriteLine($" MicroAccuracy: {metrics.MicroAccuracy:0.###}");
    Console.WriteLine($" MacroAccuracy: {metrics.MacroAccuracy:0.###}");
    Console.WriteLine($" LogLoss: {metrics.LogLoss:#.###}");
    Console.WriteLine($" LogLossReduction: {metrics.LogLossReduction:#.###}");
    Console.WriteLine(); }
// Constructs a mapper that binds each of the parent's configured concatenation
// columns against the given input schema. The host is registered off the
// (null-checked) parent's host before the base constructor runs.
public Mapper(ColumnConcatenatingTransformer parent, DataViewSchema inputSchema)
    : base(Contracts.CheckRef(parent, nameof(parent)).Host.Register(nameof(Mapper)), inputSchema, parent)
{
    _parent = parent;
    int columnCount = _parent._columns.Length;
    _columns = new BoundColumn[columnCount];
    // Resolve every configured column against the schema up front.
    for (int col = 0; col < columnCount; col++)
        _columns[col] = MakeColumn(inputSchema, col);
}
// Entry point "FeatureCombiner": gathers every column carrying the Feature role in the
// input's role mappings, converts/key-to-vector-izes them as needed (via the ConvertFeatures,
// ApplyConvert, and ApplyKeyToVec helpers), and concatenates the results into a single column
// named DefaultColumnNames.Features using a tagged-source ColumnConcatenatingTransformer.
// Throws if no feature columns are specified or if any training column is invalid (errCount > 0).
// NOTE(review): ConvertFeatures/ApplyConvert/ApplyKeyToVec are defined elsewhere in the file;
// the exact conversions they apply are not visible here. The inline REVIEW comment about
// column-name conflicts (e.g. a key-typed group-id column used as a feature) is preserved below.
public static CommonOutputs.TransformOutput PrepareFeatures(IHostEnvironment env, FeatureCombinerInput input) { const string featureCombiner = "FeatureCombiner"; Contracts.CheckValue(env, nameof(env)); var host = env.Register(featureCombiner); host.CheckValue(input, nameof(input)); EntryPointUtils.CheckInputArgs(host, input); using (var ch = host.Start(featureCombiner)) { var viewTrain = input.Data; var rms = new RoleMappedSchema(viewTrain.Schema, input.GetRoles()); var feats = rms.GetColumns(RoleMappedSchema.ColumnRole.Feature); if (Utils.Size(feats) == 0) { throw ch.Except("No feature columns specified"); } var featNames = new HashSet <string>(); var concatNames = new List <KeyValuePair <string, string> >(); List <TypeConvertingEstimator.ColumnOptions> cvt; int errCount; var ktv = ConvertFeatures(feats.ToArray(), featNames, concatNames, ch, out cvt, out errCount); Contracts.Assert(featNames.Count > 0); Contracts.Assert(concatNames.Count == featNames.Count); if (errCount > 0) { throw ch.Except("Encountered {0} invalid training column(s)", errCount); } viewTrain = ApplyConvert(cvt, viewTrain, host); viewTrain = ApplyKeyToVec(ktv, viewTrain, host); // REVIEW: What about column name conflicts? Eg, what if someone uses the group id column // (a key type) as a feature column. We convert that column to a vector so it is no longer valid // as a group id. That's just one example - you get the idea. string nameFeat = DefaultColumnNames.Features; viewTrain = ColumnConcatenatingTransformer.Create(host, new ColumnConcatenatingTransformer.TaggedOptions() { Columns = new[] { new ColumnConcatenatingTransformer.TaggedColumn() { Name = nameFeat, Source = concatNames.ToArray() } } }, viewTrain); return(new CommonOutputs.TransformOutput { Model = new TransformModelImpl(env, viewTrain, input.Data), OutputData = viewTrain }); } }
// Entry point "ConcatColumns": validates the environment and arguments, builds the
// column-concatenating transform over the input data, and wraps it in the standard
// entry-point output envelope (model + transformed data).
public static CommonOutputs.TransformOutput ConcatColumns(IHostEnvironment env, ColumnConcatenatingTransformer.Arguments input)
{
    // Standard entry-point argument validation.
    Contracts.CheckValue(env, nameof(env));
    var host = env.Register("ConcatColumns");
    host.CheckValue(input, nameof(input));
    EntryPointUtils.CheckInputArgs(host, input);

    var transform = ColumnConcatenatingTransformer.Create(env, input, input.Data);
    return new CommonOutputs.TransformOutput
    {
        Model = new TransformModelImpl(env, transform, input.Data),
        OutputData = transform
    };
}
// Constructs a mapper over the given input schema, binding every one of the
// parent's configured concatenation columns via MakeColumn.
public Mapper(ColumnConcatenatingTransformer parent, Schema inputSchema)
{
    Contracts.AssertValue(parent);
    Contracts.AssertValue(inputSchema);

    _parent = parent;
    _host = parent._host.Register(nameof(Mapper));
    _inputSchema = inputSchema;

    // Bind each configured column against the schema up front so that later
    // row mapping only needs the precomputed BoundColumn entries.
    var boundColumns = new BoundColumn[_parent._columns.Length];
    for (int index = 0; index < boundColumns.Length; index++)
        boundColumns[index] = MakeColumn(inputSchema, index);
    _columns = boundColumns;
}
// End-to-end smoke test over the iris dataset using directly-instantiated components:
// loads the file with an explicit five-column schema, concatenates the four measurement
// columns into "Features", applies a min-max normalizer (not added automatically despite the
// trainer's NormalizeFeatures setting), trains multiclass SDCA single-threaded in a seeded
// single-concurrency environment for determinism, scores/evaluates on the same file, compares
// metrics and per-row predictions, and finally pins the first summary weight to 7.757864
// (5 decimal places).
// NOTE(review): "CompareMatrics" looks like a typo for "CompareMetrics", but the helper is
// defined elsewhere in the file, so any rename must happen at its definition.
public void TrainAndPredictIrisModelUsingDirectInstantiationTest() { string dataPath = GetDataPath("iris.txt"); string testDataPath = dataPath; using (var env = new ConsoleEnvironment(seed: 1, conc: 1)) { // Pipeline var loader = TextLoader.ReadFile(env, new TextLoader.Arguments() { HasHeader = false, Column = new[] { new TextLoader.Column("Label", DataKind.R4, 0), new TextLoader.Column("SepalLength", DataKind.R4, 1), new TextLoader.Column("SepalWidth", DataKind.R4, 2), new TextLoader.Column("PetalLength", DataKind.R4, 3), new TextLoader.Column("PetalWidth", DataKind.R4, 4) } }, new MultiFileSource(dataPath)); IDataView pipeline = new ColumnConcatenatingTransformer(env, "Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth").Transform(loader); // NormalizingEstimator is not automatically added though the trainer has 'NormalizeFeatures' On/Auto pipeline = NormalizeTransform.CreateMinMaxNormalizer(env, pipeline, "Features"); // Train var trainer = new SdcaMultiClassTrainer(env, "Label", "Features", advancedSettings: (s) => s.NumThreads = 1); // Explicity adding CacheDataView since caching is not working though trainer has 'Caching' On/Auto var cached = new CacheDataView(env, pipeline, prefetch: null); var trainRoles = new RoleMappedData(cached, label: "Label", feature: "Features"); var pred = trainer.Train(trainRoles); // Get scorer and evaluate the predictions from test data IDataScorerTransform testDataScorer = GetScorer(env, pipeline, pred, testDataPath); var metrics = Evaluate(env, testDataScorer); CompareMatrics(metrics); // Create prediction engine and test predictions var model = env.CreatePredictionEngine <IrisData, IrisPrediction>(testDataScorer); ComparePredictions(model); // Get feature importance i.e. weight vector var summary = ((MulticlassLogisticRegressionPredictor)pred).GetSummaryInKeyValuePairs(trainRoles.Schema); Assert.Equal(7.757864, Convert.ToDouble(summary[0].Value), 5); } }
// Entry point "PrefixConcatColumns": for every requested output column, treats the (single)
// Source value as a NAME PREFIX, collects all input columns whose names start with that
// prefix (in schema order, culture-sensitive InvariantCulture comparison), and concatenates
// them into the output column. Throws ArgumentOutOfRangeException if a prefix matches no
// input columns. Returns the standard entry-point envelope (model + transformed data).
public static CommonOutputs.TransformOutput ConcatColumns(IHostEnvironment env, ColumnCopyingTransformer.Options input)
{
    Contracts.CheckValue(env, nameof(env));
    var host = env.Register("PrefixConcatColumns");
    host.CheckValue(input, nameof(input));
    EntryPointUtils.CheckInputArgs(host, input);

    // Get all column names with preserving order.
    var colNames = new List<string>(input.Data.Schema.Count);
    for (int i = 0; i < input.Data.Schema.Count; i++)
    {
        colNames.Add(input.Data.Schema[i].Name);
    }

    // Iterate through input options, find matching source columns, create new input options
    var inputOptions = new ColumnConcatenatingTransformer.Options() { Data = input.Data };
    var columns = new List<ColumnConcatenatingTransformer.Column>(input.Columns.Length);
    foreach (var col in input.Columns)
    {
        var newCol = new ColumnConcatenatingTransformer.Column();
        newCol.Name = col.Name;
        var prefix = col.Source;
        newCol.Source = colNames.Where(x => x.StartsWith(prefix, StringComparison.InvariantCulture)).ToArray();
        if (newCol.Source.Length == 0)
        {
            // BUG FIX: the original passed the message to the single-string constructor of
            // ArgumentOutOfRangeException, which that overload interprets as the PARAMETER
            // NAME, producing a garbled exception text ("Specified argument was out of the
            // range of valid values. Parameter name: No matching columns..."). Use the
            // (paramName, message) overload so the message reads correctly.
            throw new ArgumentOutOfRangeException(nameof(input), "No matching columns found for prefix: " + prefix);
        }
        columns.Add(newCol);
    }
    inputOptions.Columns = columns.ToArray();

    var xf = ColumnConcatenatingTransformer.Create(env, inputOptions, inputOptions.Data);
    return new CommonOutputs.TransformOutput { Model = new TransformModelImpl(env, xf, inputOptions.Data), OutputData = xf };
}
// Validates the many-to-one column mappings and, for every mapping with more than one
// source column, inserts a ColumnConcatenatingTransformer that merges the sources into
// the named output column. Mappings with a single source need no concatenation, so if
// none of the mappings are multi-source the input view is returned unchanged.
public static IDataView ApplyConcatOnSources(IHostEnvironment env, ManyToOneColumn[] columns, IDataView input)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(columns, nameof(columns));
    env.CheckValue(input, nameof(input));

    var multiSourceCols = new List <ColumnConcatenatingTransformer.Column>();
    foreach (var column in columns)
    {
        // Each mapping must be non-null with a usable name and at least one non-blank source.
        env.CheckUserArg(column != null, nameof(WordBagBuildingTransformer.Arguments.Column));
        env.CheckUserArg(!string.IsNullOrWhiteSpace(column.Name), nameof(column.Name));
        env.CheckUserArg(Utils.Size(column.Source) > 0, nameof(column.Source));
        env.CheckUserArg(column.Source.All(src => !string.IsNullOrWhiteSpace(src)), nameof(column.Source));

        // Only multi-source mappings require an explicit concatenation step.
        if (column.Source.Length > 1)
            multiSourceCols.Add(new ColumnConcatenatingTransformer.Column { Source = column.Source, Name = column.Name });
    }

    if (multiSourceCols.Count == 0)
        return input;

    var concatArgs = new ColumnConcatenatingTransformer.Arguments { Column = multiSourceCols.ToArray() };
    return ColumnConcatenatingTransformer.Create(env, concatArgs, input);
}
// Trains a two-stage model over the adult census file (_dataPath): loads a schema with the
// label at column 14, categorical text ranges {1, 3, 5-9, 13} and numeric ranges {0, 2, 4, 10-12};
// one-hot encodes the categoricals, min-max normalizes the numerics, concatenates both into
// "Features", appends the score of a 100-cluster KMeans (via TrainAndScoreTransformer) as an
// extra feature, and finally trains logistic regression with non-negative weights and
// OptTol = 1e-3 on the augmented feature vector. Uses a quiet, seeded console environment
// for deterministic, silent runs.
public ParameterMixingCalibratedPredictor TrainKMeansAndLR() { using (var env = new ConsoleEnvironment(seed: 1, verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance)) { // Pipeline var loader = TextLoader.ReadFile(env, new TextLoader.Arguments() { HasHeader = true, Separator = ",", Column = new[] { new TextLoader.Column("Label", DataKind.R4, 14), new TextLoader.Column("CatFeatures", DataKind.TX, new [] { new TextLoader.Range() { Min = 1, Max = 1 }, new TextLoader.Range() { Min = 3, Max = 3 }, new TextLoader.Range() { Min = 5, Max = 9 }, new TextLoader.Range() { Min = 13, Max = 13 } }), new TextLoader.Column("NumFeatures", DataKind.R4, new [] { new TextLoader.Range() { Min = 0, Max = 0 }, new TextLoader.Range() { Min = 2, Max = 2 }, new TextLoader.Range() { Min = 4, Max = 4 }, new TextLoader.Range() { Min = 10, Max = 12 } }) } }, new MultiFileSource(_dataPath)); IDataView trans = new OneHotEncodingEstimator(env, "CatFeatures").Fit(loader).Transform(loader); trans = NormalizeTransform.CreateMinMaxNormalizer(env, trans, "NumFeatures"); trans = new ColumnConcatenatingTransformer(env, "Features", "NumFeatures", "CatFeatures").Transform(trans); trans = TrainAndScoreTransformer.Create(env, new TrainAndScoreTransformer.Arguments { Trainer = ComponentFactoryUtils.CreateFromFunction(host => new KMeansPlusPlusTrainer(host, "Features", advancedSettings: s => { s.K = 100; })), FeatureColumn = "Features" }, trans); trans = new ColumnConcatenatingTransformer(env, "Features", "Features", "Score").Transform(trans); // Train var trainer = new LogisticRegression(env, "Label", "Features", advancedSettings: args => { args.EnforceNonNegativity = true; args.OptTol = 1e-3f; }); var trainRoles = new RoleMappedData(trans, label: "Label", feature: "Features"); return(trainer.Train(trainRoles)); } }
/// Factory method for SignatureDataTransform.
/// Builds the "NA handle" pipeline: for each requested column it replaces missing values
/// (MissingValueReplacingEstimator) and, when ConcatIndicator/Concat is set, also computes an
/// is-missing indicator (MissingValueIndicatorTransformer), converts it to the replaced column's
/// item type if needed (TypeConvertingTransformer), concatenates replacement + indicator into the
/// output column (tagged ColumnConcatenatingTransformer), and finally drops the temporary columns.
/// Throws if a source column is absent or the indicator type cannot be converted for concatenation.
// Columns without the indicator option take the simple path: a plain NAReplace column and no
// temp/indicator/concat bookkeeping.
internal static IDataTransform Create(IHostEnvironment env, Options options, IDataView input) { Contracts.CheckValue(env, nameof(env)); var h = env.Register("Categorical"); h.CheckValue(options, nameof(options)); h.CheckValue(input, nameof(input)); h.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns)); var replaceCols = new List <MissingValueReplacingEstimator.ColumnInfo>(); var naIndicatorCols = new List <MissingValueIndicatorTransformer.Column>(); var naConvCols = new List <TypeConvertingEstimator.ColumnInfo>(); var concatCols = new List <ColumnConcatenatingTransformer.TaggedColumn>(); var dropCols = new List <string>(); var tmpIsMissingColNames = input.Schema.GetTempColumnNames(options.Columns.Length, "IsMissing"); var tmpReplaceColNames = input.Schema.GetTempColumnNames(options.Columns.Length, "Replace"); for (int i = 0; i < options.Columns.Length; i++) { var column = options.Columns[i]; var addInd = column.ConcatIndicator ?? options.Concat; if (!addInd) { replaceCols.Add(new MissingValueReplacingEstimator.ColumnInfo(column.Name, column.Source, (MissingValueReplacingEstimator.ColumnInfo.ReplacementMode)(column.Kind ?? options.ReplaceWith), column.ImputeBySlot ?? options.ImputeBySlot)); continue; } // Check that the indicator column has a type that can be converted to the NAReplaceTransform output type, // so that they can be concatenated. 
// Per-column setup for the indicator path: verify the source exists and the boolean indicator
// can be converted to the replaced column's item type, stage the indicator, conversion (only
// when the standard conversion is not the identity), and replacement columns under temp names.
if (!input.Schema.TryGetColumnIndex(column.Source, out int inputCol)) { throw h.Except("Column '{0}' does not exist", column.Source); } var replaceType = input.Schema[inputCol].Type; var replaceItemType = replaceType.GetItemType(); if (!Data.Conversion.Conversions.Instance.TryGetStandardConversion(BooleanDataViewType.Instance, replaceItemType, out Delegate conv, out bool identity)) { throw h.Except("Cannot concatenate indicator column of type '{0}' to input column of type '{1}'", BooleanDataViewType.Instance, replaceItemType); } // Find a temporary name for the NAReplaceTransform and NAIndicatorTransform output columns. var tmpIsMissingColName = tmpIsMissingColNames[i]; var tmpReplacementColName = tmpReplaceColNames[i]; // Add an NAHandleTransform column. naIndicatorCols.Add(new MissingValueIndicatorTransformer.Column() { Name = tmpIsMissingColName, Source = column.Source }); // Add a ConvertTransform column if necessary. if (!identity) { if (!replaceItemType.RawType.TryGetDataKind(out DataKind replaceItemTypeKind)) { throw h.Except("Cannot get a DataKind for type '{0}'", replaceItemType.RawType); } naConvCols.Add(new TypeConvertingEstimator.ColumnInfo(tmpIsMissingColName, replaceItemTypeKind, tmpIsMissingColName)); } // Add the NAReplaceTransform column. replaceCols.Add(new MissingValueReplacingEstimator.ColumnInfo(tmpReplacementColName, column.Source, (MissingValueReplacingEstimator.ColumnInfo.ReplacementMode)(column.Kind ?? options.ReplaceWith), column.ImputeBySlot ?? options.ImputeBySlot)); // Add the ConcatTransform column. 
// Final assembly: tag the concat sources (vector sources get an "IsMissing" alias, scalar
// sources get "IsMissing.<source>"), then chain the transforms in dependency order —
// indicators -> type conversion -> NAReplace -> concat -> drop temp columns.
if (replaceType is VectorType) { concatCols.Add(new ColumnConcatenatingTransformer.TaggedColumn() { Name = column.Name, Source = new[] { new KeyValuePair <string, string>(tmpReplacementColName, tmpReplacementColName), new KeyValuePair <string, string>("IsMissing", tmpIsMissingColName) } }); } else { concatCols.Add(new ColumnConcatenatingTransformer.TaggedColumn() { Name = column.Name, Source = new[] { new KeyValuePair <string, string>(column.Source, tmpReplacementColName), new KeyValuePair <string, string>(string.Format("IsMissing.{0}", column.Source), tmpIsMissingColName), } }); } // Add the temp column to the list of columns to drop at the end. dropCols.Add(tmpIsMissingColName); dropCols.Add(tmpReplacementColName); } IDataTransform output = null; // Create the indicator columns. if (naIndicatorCols.Count > 0) { output = MissingValueIndicatorTransformer.Create(h, new MissingValueIndicatorTransformer.Options() { Columns = naIndicatorCols.ToArray() }, input); } // Convert the indicator columns to the correct type so that they can be concatenated to the NAReplace outputs. if (naConvCols.Count > 0) { h.AssertValue(output); //REVIEW: all this need to be converted to estimatorChain as soon as we done with dropcolumns. output = new TypeConvertingTransformer(h, naConvCols.ToArray()).Transform(output) as IDataTransform; } // Create the NAReplace transform. output = MissingValueReplacingTransformer.Create(env, output ?? input, replaceCols.ToArray()); // Concat the NAReplaceTransform output and the NAIndicatorTransform output. if (naIndicatorCols.Count > 0) { output = ColumnConcatenatingTransformer.Create(h, new ColumnConcatenatingTransformer.TaggedOptions() { Columns = concatCols.ToArray() }, output); } // Finally, drop the temporary indicator columns. if (dropCols.Count > 0) { output = ColumnSelectingTransformer.CreateDrop(h, output, dropCols.ToArray()) as IDataTransform; } return(output); }