/// <summary>
/// Loads multiple artifacts of interest from the input model file, given the context
/// established by the command line arguments.
/// </summary>
/// <param name="ch">The channel to which to provide output.</param>
/// <param name="wantPredictor">Whether we want a predictor from the model file. If
/// <c>false</c> we will not even attempt to load a predictor. If <c>null</c> we will
/// load the predictor, if present. If <c>true</c> we will load the predictor, or fail
/// noisily if we cannot.</param>
/// <param name="predictor">The predictor in the model, or <c>null</c> if
/// <paramref name="wantPredictor"/> was false, or <paramref name="wantPredictor"/> was
/// <c>null</c> and no predictor was present.</param>
/// <param name="wantTrainSchema">Whether we want the training schema. Unlike
/// <paramref name="wantPredictor"/>, this has no "hard fail if not present" option. If
/// this is <c>true</c>, it is still possible for <paramref name="trainSchema"/> to remain
/// <c>null</c> if there were no role mappings, or pipeline.</param>
/// <param name="trainSchema">The training schema if <paramref name="wantTrainSchema"/>
/// is true, and there were role mappings stored in the model.</param>
/// <param name="pipe">The data pipe constructed from the combination of the
/// model and command line arguments.</param>
protected void LoadModelObjects(
    IChannel ch,
    bool? wantPredictor, out IPredictor predictor,
    bool wantTrainSchema, out RoleMappedSchema trainSchema,
    out ILegacyDataLoader pipe)
{
    // NOTE(review): the comment below looks stale — the code unconditionally opens
    // ImplOptions.InputModelFile, so there is no "no input model file" path here.
    // Confirm against callers before relying on it.
    // First handle the case where there is no input model file.
    // Everything must come from the command line.
    using (var file = Host.OpenInputFile(ImplOptions.InputModelFile))
    using (var strm = file.OpenReadStream())
    using (var rep = RepositoryReader.Open(strm, Host))
    {
        // First consider loading the predictor.
        if (wantPredictor == false)
        {
            // Caller explicitly asked for no predictor: don't even attempt the load.
            predictor = null;
        }
        else
        {
            ch.Trace("Loading predictor");
            predictor = ModelFileUtils.LoadPredictorOrNull(Host, rep);
            if (wantPredictor == true)
            {
                // Hard requirement: fail noisily when a predictor was demanded but absent.
                Host.Check(predictor != null, "Could not load predictor from model file");
            }
        }
        // Next create the loader.
        var loaderFactory = ImplOptions.Loader;
        // Holds the pipe loaded from the model (with transforms), if any, so the
        // training-schema step below can reuse it instead of re-loading.
        ILegacyDataLoader trainPipe = null;
        if (loaderFactory != null)
        {
            // The loader is overridden from the command line.
            pipe = loaderFactory.CreateComponent(Host, new MultiFileSource(ImplOptions.DataFile));
            if (ImplOptions.LoadTransforms == true)
            {
                // Transforms can only come from the model file, so one must have been given.
                Host.CheckUserArg(!string.IsNullOrWhiteSpace(ImplOptions.InputModelFile), nameof(ImplOptions.InputModelFile));
                pipe = LoadTransformChain(pipe);
            }
        }
        else
        {
            // No command-line loader: take the loader (and by default its transforms) from the model.
            var loadTrans = ImplOptions.LoadTransforms ?? true;
            pipe = LoadLoader(rep, ImplOptions.DataFile, loadTrans);
            if (loadTrans)
            {
                trainPipe = pipe;
            }
        }
        if (Utils.Size(ImplOptions.Transforms) > 0)
        {
            // Append any additional transforms specified on the command line.
            pipe = LegacyCompositeDataLoader.Create(Host, pipe, ImplOptions.Transforms);
        }
        // Next consider loading the training data's role mapped schema.
        trainSchema = null;
        if (wantTrainSchema)
        {
            // First try to get the role mappings.
            var trainRoleMappings = ModelFileUtils.LoadRoleMappingsOrNull(Host, rep);
            if (trainRoleMappings != null)
            {
                // Next create the training schema. In the event that the loaded pipeline happens
                // to be the training pipe, we can just use that. If it differs, then we need to
                // load the full pipeline from the model, relying upon the fact that all loaders
                // can be loaded with no data at all, to get their schemas.
                if (trainPipe == null)
                {
                    trainPipe = ModelFileUtils.LoadLoader(Host, rep, new MultiFileSource(null), loadTransforms: true);
                }
                trainSchema = new RoleMappedSchema(trainPipe.Schema, trainRoleMappings);
            }
            // If the role mappings are null, an alternative would be to fail. However the idea
            // is that the scorer should always still succeed, although perhaps with reduced
            // functionality, even when the training schema is null, since not all versions of
            // TLC models will have the role mappings preserved, I believe. And, we do want to
            // maintain backwards compatibility.
        }
    }
}
/// <summary>
/// Builds the default scorer transform for <paramref name="predictor"/> over <paramref name="data"/>,
/// resolving the scorer component and bound mapper, then instantiating the transform.
/// </summary>
/// <param name="predictor">The predictor to score with.</param>
/// <param name="data">The role-mapped data to score.</param>
/// <param name="env">The host environment.</param>
/// <param name="trainSchema">Optional training schema the scorer may draw extra information from.</param>
/// <returns>The created scorer transform.</returns>
public static IDataScorerTransform GetScorer(IPredictor predictor, RoleMappedData data, IHostEnvironment env, RoleMappedSchema trainSchema)
{
    var scorerFactory = GetScorerComponentAndMapper(predictor, null, data.Schema, env, null, out var boundMapper);
    return scorerFactory.CreateComponent(env, data.Data, boundMapper, trainSchema);
}
/// <summary>
/// Core of the score command: loads the model and data pipe, binds the predictor into a scorer,
/// optionally saves the resulting pipeline, selects savable output columns, and writes the scored data.
/// </summary>
/// <param name="ch">The channel to which to provide output.</param>
private void RunCore(IChannel ch)
{
    Host.AssertValue(ch);
    ch.Trace("Creating loader");
    // Both predictor and train schema are requested; 'true' for wantPredictor means a
    // missing predictor is a hard failure inside LoadModelObjects.
    LoadModelObjects(ch, true, out var predictor, true, out var trainSchema, out var loader);
    ch.AssertValue(predictor);
    ch.AssertValueOrNull(trainSchema);
    ch.AssertValue(loader);
    ch.Trace("Creating pipeline");
    var scorer = Args.Scorer;
    ch.Assert(scorer == null || scorer is ICommandLineComponentFactory, "ScoreCommand should only be used from the command line.");
    var bindable = ScoreUtils.GetSchemaBindableMapper(Host, predictor, scorerFactorySettings: scorer as ICommandLineComponentFactory);
    ch.AssertValue(bindable);
    // REVIEW: We probably ought to prefer role mappings from the training schema.
    // Resolve feature/group columns from args, falling back to conventional default names.
    string feat = TrainUtils.MatchNameOrDefaultOrNull(ch, loader.Schema, nameof(Args.FeatureColumn), Args.FeatureColumn, DefaultColumnNames.Features);
    string group = TrainUtils.MatchNameOrDefaultOrNull(ch, loader.Schema, nameof(Args.GroupColumn), Args.GroupColumn, DefaultColumnNames.GroupId);
    var customCols = TrainUtils.CheckAndGenerateCustomColumns(ch, Args.CustomColumn);
    var schema = new RoleMappedSchema(loader.Schema, label: null, feature: feat, group: group, custom: customCols, opt: true);
    var mapper = bindable.Bind(Host, schema);
    if (scorer == null)
    {
        // No scorer given on the command line: pick the default component for this mapper.
        scorer = ScoreUtils.GetScorerComponent(Host, mapper);
    }
    // Fold the scorer (and any post-transforms) into the loader so the whole pipeline can be saved.
    loader = CompositeDataLoader.ApplyTransform(Host, loader, "Scorer", scorer.ToString(), (env, view) => scorer.CreateComponent(env, view, mapper, trainSchema));
    loader = CompositeDataLoader.Create(Host, loader, Args.PostTransform);
    if (!string.IsNullOrWhiteSpace(Args.OutputModelFile))
    {
        ch.Trace("Saving the data pipe");
        SaveLoader(loader, Args.OutputModelFile);
    }
    ch.Trace("Creating saver");
    IDataSaver writer;
    if (Args.Saver == null)
    {
        // Choose a saver from the output file extension: text for .txt/.tlc, binary otherwise.
        var ext = Path.GetExtension(Args.OutputDataFile);
        var isText = ext == ".txt" || ext == ".tlc";
        if (isText)
        {
            writer = new TextSaver(Host, new TextSaver.Arguments());
        }
        else
        {
            writer = new BinarySaver(Host, new BinarySaver.Arguments());
        }
    }
    else
    {
        writer = Args.Saver.CreateComponent(Host);
    }
    ch.Assert(writer != null);
    // NOTE(review): 'writer is BinaryWriter' looks suspicious — if this resolves to
    // System.IO.BinaryWriter it can never be an IDataSaver, making outputIsBinary always
    // false. Was 'writer is BinarySaver' intended? Verify before changing.
    var outputIsBinary = writer is BinaryWriter;
    // All columns are written when explicitly requested, or when nothing was specified
    // and the output is binary (binary output can hold everything cheaply).
    bool outputAllColumns = Args.OutputAllColumns == true || (Args.OutputAllColumns == null && Utils.Size(Args.OutputColumn) == 0 && outputIsBinary);
    bool outputNamesAndLabels = Args.OutputAllColumns == true || Utils.Size(Args.OutputColumn) == 0;
    if (Args.OutputAllColumns == true && Utils.Size(Args.OutputColumn) != 0)
    {
        ch.Warning(nameof(Args.OutputAllColumns) + "=+ always writes all columns irrespective of " + nameof(Args.OutputColumn) + " specified.");
    }
    if (!outputAllColumns && Utils.Size(Args.OutputColumn) != 0)
    {
        // Validate user-requested columns exist before scoring anything.
        foreach (var outCol in Args.OutputColumn)
        {
            if (!loader.Schema.TryGetColumnIndex(outCol, out int dummyColIndex))
            {
                throw ch.ExceptUserArg(nameof(Arguments.OutputColumn), "Column '{0}' not found.", outCol);
            }
        }
    }
    uint maxScoreId = 0;
    if (!outputAllColumns)
    {
        maxScoreId = loader.Schema.GetMaxMetadataKind(out int colMax, MetadataUtils.Kinds.ScoreColumnSetId);
    }
    ch.Assert(outputAllColumns || maxScoreId > 0); // score set IDs are one-based
    var cols = new List<int>();
    for (int i = 0; i < loader.Schema.Count; i++)
    {
        // Skip hidden columns unless the user asked to keep them.
        if (!Args.KeepHidden && loader.Schema[i].IsHidden)
        {
            continue;
        }
        if (!(outputAllColumns || ShouldAddColumn(loader.Schema, i, maxScoreId, outputNamesAndLabels)))
        {
            continue;
        }
        var type = loader.Schema[i].Type;
        if (writer.IsColumnSavable(type))
        {
            cols.Add(i);
        }
        else
        {
            ch.Warning("The column '{0}' will not be written as it has unsavable column type.", loader.Schema[i].Name);
        }
    }
    ch.Check(cols.Count > 0, "No valid columns to save");
    ch.Trace("Scoring and saving data");
    using (var file = Host.CreateOutputFile(Args.OutputDataFile))
    using (var stream = file.CreateWriteStream())
        writer.SaveData(stream, loader, cols.ToArray());
}
/// <summary>
/// Constructs the bound mapper, deriving the output column types (tree values, leaf indicators,
/// path indicators) from the owner's trained ensemble and building the output schema with
/// appropriate slot-name / normalization annotations. Columns whose name argument is
/// <c>null</c> are omitted from the output schema.
/// </summary>
public BoundMapper(IExceptionContext ectx, TreeEnsembleFeaturizerBindableMapper owner, RoleMappedSchema schema, string treesColumnName, string leavesColumnName, string pathsColumnName)
{
    Contracts.AssertValue(ectx);
    ectx.AssertValue(owner);
    ectx.AssertValue(schema);
    ectx.Assert(schema.Feature.HasValue);
    _ectx = ectx;
    _owner = owner;
    InputRoleMappedSchema = schema;
    // A vector containing the output of each tree on a given example.
    var treeValueType = new VectorDataViewType(NumberDataViewType.Single, owner._ensemble.TrainedEnsemble.NumTrees);
    // An indicator vector with length = the total number of leaves in the ensemble, indicating which leaf the example
    // ends up in all the trees in the ensemble.
    var leafIdType = new VectorDataViewType(NumberDataViewType.Single, owner._totalLeafCount);
    // An indicator vector with length = the total number of nodes in the ensemble, indicating the nodes on
    // the paths of the example in all the trees in the ensemble.
    // The total number of nodes in a binary tree is equal to the number of internal nodes + the number of leaf nodes,
    // and it is also equal to the number of children of internal nodes (which is 2 * the number of internal nodes)
    // plus one (since the root node is not a child of any node). So we have #internal + #leaf = 2*(#internal) + 1,
    // which means that #internal = #leaf - 1.
    // Therefore, the number of internal nodes in the ensemble is #leaf - #trees.
    var pathIdType = new VectorDataViewType(NumberDataViewType.Single, owner._totalLeafCount - owner._ensemble.TrainedEnsemble.NumTrees);
    // Start creating output schema with types derived above.
    var schemaBuilder = new DataViewSchema.Builder();
    _treesColumnName = treesColumnName;
    if (treesColumnName != null)
    {
        // Metadata of tree values.
        var treeIdMetadataBuilder = new DataViewSchema.Annotations.Builder();
        treeIdMetadataBuilder.Add(AnnotationUtils.Kinds.SlotNames, AnnotationUtils.GetNamesType(treeValueType.Size), (ValueGetter<VBuffer<ReadOnlyMemory<char>>>)owner.GetTreeSlotNames);
        // Add the column of trees' output values
        schemaBuilder.AddColumn(treesColumnName, treeValueType, treeIdMetadataBuilder.ToAnnotations());
    }
    _leavesColumnName = leavesColumnName;
    if (leavesColumnName != null)
    {
        // Metadata of leaf IDs.
        var leafIdMetadataBuilder = new DataViewSchema.Annotations.Builder();
        leafIdMetadataBuilder.Add(AnnotationUtils.Kinds.SlotNames, AnnotationUtils.GetNamesType(leafIdType.Size), (ValueGetter<VBuffer<ReadOnlyMemory<char>>>)owner.GetLeafSlotNames);
        // Leaf indicators are 0/1, hence marked as normalized.
        leafIdMetadataBuilder.Add(AnnotationUtils.Kinds.IsNormalized, BooleanDataViewType.Instance, (ref bool value) => value = true);
        // Add the column of leaves' IDs where the input example reaches.
        schemaBuilder.AddColumn(leavesColumnName, leafIdType, leafIdMetadataBuilder.ToAnnotations());
    }
    _pathsColumnName = pathsColumnName;
    if (pathsColumnName != null)
    {
        // Metadata of path IDs.
        var pathIdMetadataBuilder = new DataViewSchema.Annotations.Builder();
        pathIdMetadataBuilder.Add(AnnotationUtils.Kinds.SlotNames, AnnotationUtils.GetNamesType(pathIdType.Size), (ValueGetter<VBuffer<ReadOnlyMemory<char>>>)owner.GetPathSlotNames);
        // Path indicators are 0/1, hence marked as normalized.
        pathIdMetadataBuilder.Add(AnnotationUtils.Kinds.IsNormalized, BooleanDataViewType.Instance, (ref bool value) => value = true);
        // Add the column of encoded paths which the input example passes.
        schemaBuilder.AddColumn(pathsColumnName, pathIdType, pathIdMetadataBuilder.ToAnnotations());
    }
    OutputSchema = schemaBuilder.ToSchema();
}
/// <summary>
/// Creates the aggregator for this evaluator, noting whether the schema carries a weight column.
/// </summary>
private protected override Aggregator GetAggregatorCore(RoleMappedSchema schema, string stratName)
{
    bool hasWeight = schema.Weight != null;
    return new Aggregator(Host, LossFunction, hasWeight, stratName);
}
/// <summary>
/// Creates a data scorer from the 'LoadName{settings}' string.
/// </summary>
/// <param name="env">The host environment to use.</param>
/// <param name="settings">The settings string.</param>
/// <param name="data">The data to score.</param>
/// <param name="predictor">The predictor to score.</param>
/// <param name="trainSchema">The training data schema from which the scorer can optionally extract
/// additional information, for example, label names. If this is <c>null</c>, no information will be
/// extracted.</param>
/// <returns>The scored data.</returns>
internal static IDataScorerTransform CreateScorer(this IHostEnvironment env, string settings, RoleMappedData data, IPredictor predictor, RoleMappedSchema trainSchema = null)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(data, nameof(data));
    env.CheckValue(predictor, nameof(predictor));
    env.CheckValueOrNull(trainSchema);

    // Parse the settings string into a command-line component factory for the scorer signature.
    Type componentFactoryType = typeof(IComponentFactory<IDataView, ISchemaBoundMapper, RoleMappedSchema, IDataScorerTransform>);
    Type scorerSignatureType = typeof(SignatureDataScorer);
    ICommandLineComponentFactory parsedScorerSettings = CmdParser.CreateComponentFactory(componentFactoryType, scorerSignatureType, settings);

    // Bind the predictor's mapper against the data schema, then instantiate the scorer transform.
    var bindableMapper = ScoreUtils.GetSchemaBindableMapper(env, predictor, scorerFactorySettings: parsedScorerSettings);
    var boundMapper = bindableMapper.Bind(env, data.Schema);
    return CreateCore<IDataScorerTransform>(env, componentFactoryType, scorerSignatureType, settings, data.Data, boundMapper, trainSchema);
}
/// <summary>
/// Creates the default data scorer for the given predictor and data, delegating the choice of
/// scorer component to <see cref="ScoreUtils.GetScorer(IPredictor, RoleMappedData, IHostEnvironment, RoleMappedSchema)"/>.
/// </summary>
/// <param name="env">The host environment to use.</param>
/// <param name="data">The data to score.</param>
/// <param name="predictor">The predictor to score.</param>
/// <param name="trainSchema">Optional training schema for extracting extra information such as label names.</param>
/// <returns>The scored data.</returns>
internal static IDataScorerTransform CreateDefaultScorer(this IHostEnvironment env, RoleMappedData data, IPredictor predictor, RoleMappedSchema trainSchema = null)
{
    // Validate arguments up front; trainSchema alone may be null.
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(data, nameof(data));
    env.CheckValue(predictor, nameof(predictor));
    env.CheckValueOrNull(trainSchema);

    return ScoreUtils.GetScorer(predictor, data, env, trainSchema);
}
/// <summary>
/// Wraps a row mapper so that the key values of the training label column are surfaced as
/// slot names on the score output column.
/// </summary>
internal static ISchemaBoundMapper WrapCore<T>(IHostEnvironment env, ISchemaBoundMapper mapper, RoleMappedSchema trainSchema)
{
    Contracts.AssertValue(env);
    env.AssertValue(mapper);
    env.AssertValue(trainSchema);
    env.Assert(mapper is ISchemaBoundRowMapper);

    // Key values from the training schema label, will map to slot names of the score output.
    var keyValuesType = trainSchema.Label.Value.Annotations.Schema.GetColumnOrNull(AnnotationUtils.Kinds.KeyValues)?.Type;
    env.AssertValue(keyValuesType);
    env.Assert(keyValuesType is VectorType);

    // Wrap the fetching of the metadata as a simple getter.
    ValueGetter<VBuffer<T>> keyValueFetcher = (ref VBuffer<T> dst) => trainSchema.Label.Value.GetKeyValues(ref dst);

    return LabelNameBindableMapper.CreateBound<T>(env, (ISchemaBoundRowMapper)mapper, keyValuesType as VectorType, keyValueFetcher, AnnotationUtils.Kinds.SlotNames, CanWrap);
}
// Constructs the multi-class scorer. The mapper is passed through WrapIfNeeded so that, when
// possible, the training label's key values become slot names on the score column; all other
// behavior is delegated to the base scorer with the multi-class score column kind/value kind.
internal MultiClassClassifierScorer(IHostEnvironment env, Arguments args, IDataView data, ISchemaBoundMapper mapper, RoleMappedSchema trainSchema)
    : base(args, env, data, WrapIfNeeded(env, mapper, trainSchema), trainSchema, RegistrationName,
        AnnotationUtils.Const.ScoreColumnKind.MultiClassClassification,
        AnnotationUtils.Const.ScoreValueKind.Score, OutputTypeMatches, GetPredColType)
{
}
// Factory method for SignatureDataScorer.
/// <summary>
/// Creates the feature-contribution scorer transform after validating the arguments and
/// that the supplied mapper is of the expected <see cref="RowMapper"/> type.
/// </summary>
private static IDataScorerTransform Create(IHostEnvironment env, Arguments args, IDataView data, ISchemaBoundMapper mapper, RoleMappedSchema trainSchema)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(data, nameof(data));
    env.CheckValue(mapper, nameof(mapper));
    if (args.Top < 0)
    {
        throw env.Except($"Number of top contribution must be non negative");
    }
    if (args.Bottom < 0)
    {
        throw env.Except($"Number of bottom contribution must be non negative");
    }

    var contributionMapper = mapper as RowMapper;
    // BUG FIX: the original checked 'mapper != null', which was already validated above,
    // so a mapper of the wrong concrete type slipped through the check and caused a null
    // reference downstream. The 'as' cast result is what must be validated here.
    env.CheckParam(contributionMapper != null, nameof(mapper), "Unexpected mapper");

    var scorer = ScoreUtils.GetScorerComponent(env, contributionMapper);
    var scoredPipe = scorer.CreateComponent(env, data, contributionMapper, trainSchema);
    return scoredPipe;
}
/// <summary>
/// This function performs a number of checks on the inputs and, if appropriate and possible, will produce
/// a mapper with slots names on the output score column properly mapped. If this is not possible for any
/// reason, it will just return the input bound mapper.
/// </summary>
private static ISchemaBoundMapper WrapIfNeeded(IHostEnvironment env, ISchemaBoundMapper mapper, RoleMappedSchema trainSchema)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(mapper, nameof(mapper));
    env.CheckValueOrNull(trainSchema);

    // We take the key values from the train schema label and present them as slot name
    // metadata, but only when every precondition below holds; otherwise fall back to
    // returning the mapper untouched.
    if (trainSchema?.Label == null)
    {
        // No label identified in a training schema — nothing to derive slot names from.
        return mapper;
    }

    // The label must carry key-value annotations of vector type, and the mapper itself
    // must be of a shape that supports wrapping.
    var keyValuesVectorType = trainSchema.Label.Value.Annotations.Schema.GetColumnOrNull(AnnotationUtils.Kinds.KeyValues)?.Type as VectorType;
    if (keyValuesVectorType == null || !CanWrap(mapper, keyValuesVectorType))
    {
        return mapper;
    }

    // All checks pass: build the wrapped mapper, dispatching on the key-value item type.
    return Utils.MarshalInvoke(WrapCore<int>, keyValuesVectorType.ItemType.RawType, env, mapper, trainSchema);
}
/// <summary>
/// Extension point for derived evaluators to validate the types of any custom columns in the
/// schema. The base implementation intentionally does nothing.
/// </summary>
private protected virtual void CheckCustomColumnTypesCore(RoleMappedSchema schema)
{
}
/// <summary>
/// Derived evaluators must verify that the score and label columns in <paramref name="schema"/>
/// have types acceptable to this evaluator, throwing if they do not.
/// </summary>
private protected abstract void CheckScoreAndLabelTypes(RoleMappedSchema schema);
// This method does as many passes over the data as needed by the evaluator, and computes the metrics, outputting the
// results in a dictionary from the metric kind (overal/per-fold/confusion matrix/PR-curves etc.), to a data view containing
// the metric. If there are stratified metrics, an additional column is added to the data view containing the
// stratification value as text in the format "column x = y".
// NOTE(review): this method appears truncated in this view (no return of consolidate());
// confirm the remainder against the full file before editing.
private Dictionary<string, IDataView> ProcessData(IDataView data, RoleMappedSchema schema, Func<int, bool> activeColsIndices, TAgg aggregator, AggregatorDictionaryBase[] dictionaries)
{
    // A pass is "finished" for everyone: the main aggregator and every stratified aggregator
    // get their FinishPass called; any of them may request another pass.
    Func<bool> finishPass = () =>
    {
        var need = aggregator.FinishPass();
        foreach (var agg in dictionaries.SelectMany(dict => dict.GetAll()))
        {
            need |= agg.FinishPass();
        }
        return need;
    };
    bool needMorePasses = aggregator.Start();
    var activeCols = data.Schema.Where(x => activeColsIndices(x.Index));
    // REVIEW: Add progress reporting.
    while (needMorePasses)
    {
        using (var cursor = data.GetRowCursor(activeCols))
        {
            // Let every active aggregator (main and stratified) set up for this pass.
            if (aggregator.IsActive())
            {
                aggregator.InitializeNextPass(cursor, schema);
            }
            for (int i = 0; i < Utils.Size(dictionaries); i++)
            {
                dictionaries[i].Reset(cursor);
                foreach (var agg in dictionaries[i].GetAll())
                {
                    if (agg.IsActive())
                    {
                        agg.InitializeNextPass(cursor, schema);
                    }
                }
            }
            // Feed every row to the active aggregators; each dictionary resolves the
            // stratification-specific aggregator for the current row via Get().
            while (cursor.MoveNext())
            {
                if (aggregator.IsActive())
                {
                    aggregator.ProcessRow();
                }
                for (int i = 0; i < Utils.Size(dictionaries); i++)
                {
                    var agg = dictionaries[i].Get();
                    if (agg.IsActive())
                    {
                        agg.ProcessRow();
                    }
                }
            }
        }
        needMorePasses = finishPass();
    }
    Action<uint, ReadOnlyMemory<char>, TAgg> addAgg;
    Func<Dictionary<string, IDataView>> consolidate;
    GetAggregatorConsolidationFuncs(aggregator, dictionaries, out addAgg, out consolidate);
    // The overall (non-stratified) aggregator is registered under strat key 0.
    uint stratColKey = 0;
    addAgg(stratColKey, default, aggregator);
/// <summary>
/// Derived evaluators must create the aggregator appropriate for <paramref name="schema"/>,
/// tagged with the given stratification name (empty for the overall aggregator).
/// </summary>
private protected abstract TAgg GetAggregatorCore(RoleMappedSchema schema, string stratName);
/// <summary>
/// Get an aggregator for the specific evaluator given the current RoleMappedSchema,
/// using the empty stratification name (i.e. the overall, non-stratified aggregator).
/// </summary>
private TAgg GetAggregator(RoleMappedSchema schema) => GetAggregatorCore(schema, "");