/// <param name="env">The environment.</param>
/// <param name="registrationName">The registration name.</param>
/// <param name="inputDataView">The input data view.</param>
/// <param name="splitColumn">The column to use for splitting data into folds.</param>
/// <param name="args">Cross validation arguments. Must not be null.</param>
/// <param name="createExamples">The delegate to create RoleMappedData</param>
/// <param name="applyTransformsToTestData">The delegate to apply the transforms from the train pipeline to the test data</param>
/// <param name="scorer">The scorer</param>
/// <param name="evaluator">The evaluator</param>
/// <param name="getValidationDataView">The delegate to create validation data view</param>
/// <param name="applyTransformsToValidationData">The delegate to apply the transforms from the train pipeline to the validation data.
/// Required whenever <paramref name="getValidationDataView"/> is supplied.</param>
/// <param name="inputPredictor">The input predictor, for the continue training option</param>
/// <param name="cmd">The command string.</param>
/// <param name="loader">Original loader so we can construct correct pipeline for model saving.</param>
/// <param name="savePerInstance">Whether to produce the per-instance data view.</param>
public FoldHelper(
    IHostEnvironment env,
    string registrationName,
    IDataView inputDataView,
    string splitColumn,
    Arguments args,
    Func<IHostEnvironment, IChannel, IDataView, ITrainer, RoleMappedData> createExamples,
    Func<IHostEnvironment, IChannel, IDataView, RoleMappedData, IDataView, RoleMappedData> applyTransformsToTestData,
    SubComponent<IDataScorerTransform, SignatureDataScorer> scorer,
    SubComponent<IMamlEvaluator, SignatureMamlEvaluator> evaluator,
    Func<IDataView> getValidationDataView = null,
    Func<IHostEnvironment, IChannel, IDataView, RoleMappedData, IDataView, RoleMappedData> applyTransformsToValidationData = null,
    IPredictor inputPredictor = null,
    string cmd = null,
    IDataLoader loader = null,
    bool savePerInstance = false)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckNonWhiteSpace(registrationName, nameof(registrationName));
    env.CheckValue(inputDataView, nameof(inputDataView));
    env.CheckValue(splitColumn, nameof(splitColumn));

    // args is dereferenced by the checks and assignments below, so reject null up front
    // with a proper argument exception instead of a NullReferenceException.
    env.CheckValue(args, nameof(args));
    env.CheckParam(args.NumFolds > 1, nameof(args.NumFolds));
    env.CheckValue(createExamples, nameof(createExamples));
    env.CheckValue(applyTransformsToTestData, nameof(applyTransformsToTestData));
    env.CheckParam(args.Trainer.IsGood(), nameof(args.Trainer));
    env.CheckValueOrNull(scorer);
    env.CheckValueOrNull(evaluator);
    env.CheckValueOrNull(args.Calibrator);
    env.CheckParam(args.MaxCalibrationExamples > 0, nameof(args.MaxCalibrationExamples));

    // A validation data view is only usable together with the delegate that transforms it.
    env.CheckParam(getValidationDataView == null || applyTransformsToValidationData != null, nameof(applyTransformsToValidationData));
    env.CheckValueOrNull(inputPredictor);
    env.CheckValueOrNull(cmd);
    env.CheckValueOrNull(args.OutputModelFile);
    env.CheckValueOrNull(loader);

    _env = env;
    _registrationName = registrationName;
    _inputDataView = inputDataView;
    _splitColumn = splitColumn;
    _numFolds = args.NumFolds;
    _createExamples = createExamples;
    _applyTransformsToTestData = applyTransformsToTestData;
    _trainer = args.Trainer;
    _scorer = scorer;
    _evaluator = evaluator;
    _calibrator = args.Calibrator;
    _maxCalibrationExamples = args.MaxCalibrationExamples;
    _useThreads = args.UseThreads;
    _cacheData = args.CacheData;
    _getValidationDataView = getValidationDataView;
    _applyTransformsToValidationData = applyTransformsToValidationData;
    _inputPredictor = inputPredictor;
    _cmd = cmd;
    _outputModelFile = args.OutputModelFile;
    _loader = loader;
    _savePerInstance = savePerInstance;
}
/// <summary>
/// Verifies that the matrix column-index and row-index columns of <paramref name="schema"/>
/// have the same types as those recorded on the parent predictor.
/// </summary>
/// <param name="schema">The input schema to validate.</param>
/// <param name="matrixColumnIndexCol">Index, in <paramref name="schema"/>, of the matrix column-index column.</param>
/// <param name="matrixRowIndexCol">Index, in <paramref name="schema"/>, of the matrix row-index column.</param>
private void CheckInputSchema(ISchema schema, int matrixColumnIndexCol, int matrixRowIndexCol)
{
    // See if matrix-column-index role's type matches the one expected in the trained predictor.
    var type = schema.GetColumnType(matrixColumnIndexCol);
    string msg = string.Format("Input column index type '{0}' incompatible with predictor's column index type '{1}'", type, _parent.MatrixColumnIndexType);
    _env.CheckParam(type.Equals(_parent.MatrixColumnIndexType), nameof(schema), msg);

    // See if matrix-row-index role's type matches the one expected in the trained predictor.
    type = schema.GetColumnType(matrixRowIndexCol);
    msg = string.Format("Input row index type '{0}' incompatible with predictor's row index type '{1}'", type, _parent.MatrixRowIndexType);
    _env.CheckParam(type.Equals(_parent.MatrixRowIndexType), nameof(schema), msg);
}
/// <summary>
/// Filter the dataset by the values of a numeric column.
/// </summary>
/// <remarks>
/// Keep only those rows that satisfy the range condition: the value of column <paramref name="columnName"/>
/// must be between <paramref name="lowerBound"/> (inclusive) and <paramref name="upperBound"/> (exclusive).
/// </remarks>
/// <param name="input">The input data.</param>
/// <param name="columnName">The name of a column to use for filtering.</param>
/// <param name="lowerBound">The inclusive lower bound.</param>
/// <param name="upperBound">The exclusive upper bound. Must be strictly greater than <paramref name="lowerBound"/>.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[FilterRowsByColumn](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/FilterRowsByColumn.cs)]
/// ]]>
/// </format>
/// </example>
public IDataView FilterRowsByColumn(IDataView input, string columnName, double lowerBound = double.NegativeInfinity, double upperBound = double.PositiveInfinity)
{
    _env.CheckValue(input, nameof(input));
    _env.CheckNonEmpty(columnName, nameof(columnName));

    // The message describes the offending parameter (upperBound): it has to exceed lowerBound.
    _env.CheckParam(lowerBound < upperBound, nameof(upperBound), "Must be greater than lowerBound");

    var type = input.Schema[columnName].Type;
    if (!(type is NumberDataViewType))
    {
        throw _env.ExceptSchemaMismatch(nameof(columnName), "filter", columnName, "number", type.ToString());
    }

    // The trailing 'false' is passed through to RangeFilter unchanged.
    return new RangeFilter(_env, input, columnName, lowerBound, upperBound, false);
}
/// <summary>
/// Builds a <see cref="SimplePredictionEngine"/> from a serialized model.
/// This covers the 'classic' prediction problem: the input is a float array of features and the
/// output is a float score. For binary classification predictors that can output probability, the
/// output also reports the predicted label and probability.
/// </summary>
/// <param name="env">The host environment to use.</param>
/// <param name="modelStream">The model stream to load pipeline from.</param>
/// <param name="nFeatures">Number of features; must be positive.</param>
public static SimplePredictionEngine CreateSimplePredictionEngine(this IHostEnvironment env, Stream modelStream, int nFeatures)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(modelStream, nameof(modelStream));

    bool hasFeatures = nFeatures > 0;
    env.CheckParam(hasFeatures, nameof(nFeatures), "Number of features must be positive.");

    var engine = new SimplePredictionEngine(env, modelStream, nFeatures);
    return engine;
}
/// <summary>
/// Reads serialized bindings from <paramref name="ctx"/> and re-binds them against <paramref name="input"/>.
/// </summary>
/// <param name="ctx">The model load context to read from.</param>
/// <param name="input">The input schema to bind against.</param>
/// <param name="env">The host environment used for checks and binding.</param>
/// <param name="bindable">The bindable mapper; binding it must yield an <see cref="ISchemaBoundRowMapper"/>.</param>
/// <param name="outputTypeMatches">Predicate the type of the score column must satisfy.</param>
/// <param name="getPredColType">Computes the predicted-label column type from the score type and the bound row mapper.</param>
/// <returns>The reconstructed bindings.</returns>
public static BindingsImpl Create(ModelLoadContext ctx, DataViewSchema input, IHostEnvironment env, ISchemaBindableMapper bindable,
    Func<DataViewType, bool> outputTypeMatches, Func<DataViewType, ISchemaBoundRowMapper, DataViewType> getPredColType)
{
    Contracts.AssertValue(env);
    env.AssertValue(ctx);
    // These are all used unconditionally below; assert them like ctx.
    env.AssertValue(input);
    env.AssertValue(bindable);
    env.AssertValue(outputTypeMatches);
    env.AssertValue(getPredColType);

    // *** Binary format ***
    // <base info>
    // int: id of the scores column kind (metadata output)
    // int: id of the column used for deriving the predicted label column

    string suffix;
    var roles = LoadBaseInfo(ctx, out suffix);

    string scoreKind = ctx.LoadNonEmptyString();
    string scoreCol = ctx.LoadNonEmptyString();

    var mapper = bindable.Bind(env, new RoleMappedSchema(input, roles));
    var rowMapper = mapper as ISchemaBoundRowMapper;
    // The requirement is on what the bindable binds to, so name the interface that is actually tested.
    env.CheckParam(rowMapper != null, nameof(bindable), "Bindable expected to bind to an " + nameof(ISchemaBoundRowMapper) + "!");

    // Find the score column of the mapper.
    int scoreColIndex;
    env.CheckDecode(mapper.OutputSchema.TryGetColumnIndex(scoreCol, out scoreColIndex));

    var scoreType = mapper.OutputSchema[scoreColIndex].Type;
    env.CheckDecode(outputTypeMatches(scoreType));
    var predColType = getPredColType(scoreType, rowMapper);

    return new BindingsImpl(input, rowMapper, suffix, scoreKind, false, scoreColIndex, predColType);
}
/// <summary>
/// Wraps a single row as a one-row data view. Handy when you hold a <see cref="DataViewRow"/> but need
/// a facility that only accepts data views (for example, saving the row with a
/// <see cref="Microsoft.ML.Data.IO.BinarySaver"/>).
/// This method cannot guarantee that the input <paramref name="row"/> stays unchanged, so callers must
/// be careful with the row, and with the source it came from, for as long as the returned data view
/// may be in use.
/// </summary>
/// <param name="env">An environment used to create the host for the resulting data view</param>
/// <param name="row">A row, whose columns must all be active</param>
/// <returns>A single-row data view incorporating that row</returns>
public static IDataView RowAsDataView(IHostEnvironment env, DataViewRow row)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(row, nameof(row));

    // Every column of the row has to be active for the wrapper to be valid.
    var columnIndices = Enumerable.Range(0, row.Schema.Count);
    bool everyColumnActive = columnIndices.All(index => row.IsColumnActive(index));
    env.CheckParam(everyColumnActive, nameof(row), "Some columns were inactive");

    return new OneRowDataView(env, row);
}
/// <summary>
/// Gets the coefficient statistics as an object: the bias entry first, followed by the remaining
/// coefficients ordered by descending z-score.
/// </summary>
/// <param name="parent">The predictor whose bias and statistics are read.</param>
/// <param name="schema">The schema passed on when collecting the per-coefficient statistics.</param>
/// <param name="paramCountCap">Non-negative cap on the number of entries; clamped to the parameter count.</param>
/// <returns>The statistics array, or null when the bias statistics cannot be obtained.</returns>
public CoefficientStatistics[] GetCoefficientStatistics(LinearBinaryPredictor parent, RoleMappedSchema schema, int paramCountCap)
{
    Contracts.AssertValue(_env);
    _env.CheckValue(parent, nameof(parent));
    _env.CheckValue(schema, nameof(schema));
    _env.CheckParam(paramCountCap >= 0, nameof(paramCountCap));

    int cap = paramCountCap > _paramCount ? _paramCount : paramCountCap;

    var bias = parent.Bias;
    Single biasStdError;
    Single biasZScore;
    Single biasPValue;
    bool hasBiasStatistics = TryGetBiasStatistics(parent.Statistics, bias, out biasStdError, out biasZScore, out biasPValue);
    if (!hasBiasStatistics)
    {
        return null;
    }

    // One slot is reserved for the bias entry. With a cap of 0, Take receives -1 and yields
    // nothing, so the result still holds the bias entry alone.
    var rankedCoefficients = GetUnorderedCoefficientStatistics(parent, schema)
        .OrderByDescending(stat => stat.ZScore)
        .Take(cap - 1);

    var biasEntry = new CoefficientStatistics("(Bias)", bias, biasStdError, biasZScore, biasPValue);
    return rankedCoefficients.Prepend(biasEntry).ToArray();
}
/// <summary>
/// Binds this mapper to <paramref name="schema"/>, which must have a feature column.
/// </summary>
/// <param name="env">The host environment.</param>
/// <param name="schema">The role-mapped schema to bind to.</param>
/// <returns>A new bound mapper over <paramref name="schema"/>.</returns>
public ISchemaBoundMapper Bind(IHostEnvironment env, RoleMappedSchema schema)
{
    Contracts.AssertValue(env);
    env.AssertValue(schema);

    bool hasFeatureColumn = schema.Feature != null;
    env.CheckParam(hasFeatureColumn, nameof(schema), "Need a feature column");

    ISchemaBoundMapper bound = new BoundMapper(env, this, schema);
    return bound;
}
/// <summary>
/// Checkpoints <see cref="TimeSeriesPredictionEngine{TSrc, TDst}"/> to disk with the updated
/// state. The file at <paramref name="modelPath"/> is created (or overwritten) and the model is
/// written through the stream-based overload.
/// </summary>
/// <param name="env">Usually <see cref="MLContext"/>.</param>
/// <param name="modelPath">Path to file on disk where the updated model needs to be saved.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// This is an example for checkpointing time series that detects change point using Singular Spectrum Analysis (SSA) model.
/// [!code-csharp[Checkpoint](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TimeSeries/DetectChangePointBySsa.cs)]
/// ]]>
/// </format>
/// </example>
public void CheckPoint(IHostEnvironment env, string modelPath)
{
    Contracts.CheckValue(env, nameof(env));

    bool hasPath = !string.IsNullOrEmpty(modelPath);
    env.CheckParam(hasPath, nameof(modelPath));

    using (var output = File.Create(modelPath))
    {
        CheckPoint(env, output);
    }
}
// Factory method for SignatureBindableMapper.
// Wraps a predictor that supports feature contributions in a BindableMapper configured from args.
private static ISchemaBindableMapper Create(IHostEnvironment env, Arguments args, IPredictor predictor)
{
    Contracts.CheckValue(env, nameof(env));
    // args is dereferenced when constructing the mapper; fail with a clear argument error if it is missing.
    env.CheckValue(args, nameof(args));
    env.CheckValue(predictor, nameof(predictor));

    var pred = predictor as IFeatureContributionMapper;
    env.CheckParam(pred != null, nameof(predictor), "Predictor doesn't support getting feature contributions");

    return new BindableMapper(env, pred, args.Top, args.Bottom, args.Normalize, args.Stringify);
}
/// <summary>
/// Builds a delegate that, for a given schema, loads the pipeline stored in <paramref name="modelStream"/>
/// over an empty data view of that schema and returns it as a row-to-row mapper.
/// </summary>
/// <param name="env">The host environment; must not be null.</param>
/// <param name="modelStream">The stream holding the serialized model; must not be null.</param>
/// <returns>A function from schema to the row-to-row mapper of the loaded pipeline.</returns>
private static Func<Schema, IRowToRowMapper> StreamChecker(IHostEnvironment env, Stream modelStream)
{
    // Validate eagerly: the lambda below runs later, so a null env would otherwise only
    // surface when the returned delegate is invoked.
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(modelStream, nameof(modelStream));

    return schema =>
    {
        // NOTE(review): modelStream is read on every invocation of this delegate; presumably it is
        // invoked once or the loader handles positioning - confirm against callers.
        var pipe = DataViewConstructionUtils.LoadPipeWithPredictor(env, modelStream, new EmptyDataView(env, schema));
        var transformer = new TransformWrapper(env, pipe);

        // The wrapper is a local, not a parameter. Report the argument the model came from.
        env.CheckParam(transformer.IsRowToRowMapper, nameof(modelStream), "Must be a row to row mapper");
        return transformer.GetRowToRowMapper(schema);
    };
}
// Constructor consisting of contract-check call patterns. The trailing "Should fail" /
// "Should succeed" / "Should not fail" comments are the original author's expected verdict
// for each call site.
// NOTE(review): this looks like input for a static-analysis test of contract-check usage rather
// than production logic - confirm before "fixing" any of the flagged calls.
public TypeName(IHostEnvironment env, float p, int foo)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckParam(0 <= p && p <= 1, nameof(p), "Should be in range [0,1]");
    env.CheckParam(0 <= p && p <= 1, "p"); // Should fail.
    env.CheckParam(0 <= p && p <= 1, nameof(p) + nameof(p)); // Should fail.
    env.CheckValue(paramName: nameof(p), val: "p"); // Should succeed despite confusing order.
    env.CheckValue(paramName: "p", val: nameof(p)); // Should fail despite confusing order.
    env.CheckValue("p", nameof(p));
    env.CheckUserArg(foo > 5, "foo", "Nice");
    env.CheckUserArg(foo > 5, nameof(foo), "Nice");

    env.Except(); // Not throwing or doing anything with the exception, so should fail.
    Contracts.ExceptParam(nameof(env), "What a silly env"); // Should also fail.
    if (false) { throw env.Except(); // Should not fail.
    }
    if (false) { throw env.ExceptParam(nameof(env), "What a silly env"); // Should not fail.
    }
    if (false) { throw env.ExceptParam("env", "What a silly env"); // Should fail due to name error.
    }
    // Here the created exception is stored in a local instead of being discarded.
    var e = env.Except();

    // Message arguments in several forms: interpolated, literal, local variable,
    // string.Format, and a member of Messages.
    env.Check(true, $"Hello {foo} is cool");
    env.Check(true, "Hello it is cool");
    string coolMessage = "Hello it is cool";
    env.Check(true, coolMessage);
    env.Check(true, string.Format("Hello {0} is cool", foo));
    env.Check(true, Messages.CoolMessage);
    env.CheckDecode(true, "Not suspicious, no ModelLoadContext");
    Contracts.Check(true, "Fine: " + nameof(env));
    Contracts.Check(true, "Less fine: " + env.GetType().Name);
    Contracts.CheckUserArg(0 <= p && p <= 1, "p", "On a new line");
}
// REVIEW: AppendRowsDataView now only checks schema consistency up to column names and types.
// A future task will be to ensure that the sources are consistent on the metadata level.

/// <summary>
/// Create a dataview by appending the rows of the sources.
///
/// All sources must be consistent with the passed-in schema in the number of columns, column names,
/// and column types. If schema is null, the first source's schema will be used.
/// When exactly one source is supplied, that source itself is returned.
/// </summary>
/// <param name="env">The host environment.</param>
/// <param name="schema">The schema for the result. If this is null, the first source's schema will be used.</param>
/// <param name="sources">The sources to be appended. Must be non-empty and contain no null entries.</param>
/// <returns>The resulting IDataView.</returns>
public static IDataView Create(IHostEnvironment env, Schema schema, params IDataView[] sources)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(sources, nameof(sources));
    env.CheckNonEmpty(sources, nameof(sources), "There must be at least one source.");

    bool containsNullSource = sources.Any(source => source == null);
    env.CheckParam(!containsNullSource, nameof(sources));
    env.CheckValueOrNull(schema);

    // Nothing to append when there is only one source.
    return sources.Length == 1
        ? sources[0]
        : new AppendRowsDataView(env, schema, sources);
}
/// <summary>
/// Checkpoints <see cref="TimeSeriesPredictionEngine{TSrc, TDst}"/> to a <see cref="Stream"/> with the updated
/// state. When the transformer exposes its chain through <see cref="ITransformerChainAccessor"/>, a new
/// chain is built from the accessor's transformers and scopes and saved; otherwise the transformer is saved directly.
/// </summary>
/// <param name="env">Usually <see cref="MLContext"/>.</param>
/// <param name="stream">Stream where the updated model needs to be saved.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// This is an example for checkpointing time series that detects change point using Singular Spectrum Analysis (SSA) model.
/// [!code-csharp[Checkpoint](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TimeSeries/DetectChangePointBySsaStream.cs)]
/// ]]>
/// </format>
/// </example>
public void CheckPoint(IHostEnvironment env, Stream stream)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckParam(stream != null, nameof(stream));

    var chainAccessor = Transformer as ITransformerChainAccessor;
    if (chainAccessor == null)
    {
        // Not a chain accessor: save the transformer as it is.
        Transformer.SaveTo(env, stream);
        return;
    }

    var chain = new TransformerChain<ITransformer>(chainAccessor.Transformers, chainAccessor.Scopes);
    chain.SaveTo(env, stream);
}
/// <summary>
/// Re-creates these bindings over a new input schema by binding <paramref name="bindable"/> to it
/// with the input column roles of the current row mapper.
/// </summary>
/// <param name="input">The new input schema.</param>
/// <param name="bindable">The bindable mapper; binding it must yield an <see cref="ISchemaBoundRowMapper"/>.</param>
/// <param name="env">The host environment used for checks and binding.</param>
/// <returns>Bindings over <paramref name="input"/> that reuse the current suffix, score column kind and predicted column type.</returns>
public BindingsImpl ApplyToSchema(DataViewSchema input, ISchemaBindableMapper bindable, IHostEnvironment env)
{
    Contracts.AssertValue(env);
    env.AssertValue(input);
    env.AssertValue(bindable);

    string scoreColumnName = RowMapper.OutputSchema[ScoreColumnIndex].Name;
    var roleMappedInput = new RoleMappedSchema(input, RowMapper.GetInputColumnRoles());

    // Checks compatibility of the predictor input types.
    var boundMapper = bindable.Bind(env, roleMappedInput);
    var boundRowMapper = boundMapper as ISchemaBoundRowMapper;
    env.CheckParam(boundRowMapper != null, nameof(bindable), "Mapper must implement ISchemaBoundRowMapper");

    // The freshly bound mapper has to expose the score column under the same name.
    int boundScoreIndex;
    bool hasScoreColumn = boundRowMapper.OutputSchema.TryGetColumnIndex(scoreColumnName, out boundScoreIndex);
    env.Check(hasScoreColumn, "Mapper doesn't have expected score column");

    return new BindingsImpl(input, boundRowMapper, Suffix, ScoreColumnKind, true, boundScoreIndex, PredColType);
}
/// <summary>
/// Creates a bindable mapper for a predictor that supports feature contributions.
/// Both the top and bottom contribution counts must lie in (0, MaxTopBottom].
/// </summary>
/// <param name="env">The host environment.</param>
/// <param name="args">The arguments supplying Top, Bottom, Normalize and Stringify.</param>
/// <param name="predictor">The predictor; must implement <see cref="IFeatureContributionMapper"/>.</param>
/// <returns>The bindable mapper.</returns>
public static ISchemaBindableMapper Create(IHostEnvironment env, Arguments args, IPredictor predictor)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(args, nameof(args));
    env.CheckValue(predictor, nameof(predictor));

    bool topOutOfRange = args.Top <= 0 || args.Top > MaxTopBottom;
    if (topOutOfRange)
    {
        throw env.Except($"Number of top contribution must be in range (0,{MaxTopBottom}]");
    }

    bool bottomOutOfRange = args.Bottom <= 0 || args.Bottom > MaxTopBottom;
    if (bottomOutOfRange)
    {
        throw env.Except($"Number of bottom contribution must be in range (0,{MaxTopBottom}]");
    }

    var contributionMapper = predictor as IFeatureContributionMapper;
    env.CheckParam(contributionMapper != null, nameof(predictor), "Predictor doesn't support getting feature contributions");

    return new BindableMapper(env, contributionMapper, args.Top, args.Bottom, args.Normalize, args.Stringify);
}
/// <summary>
/// The degree of concurrency is passed in the conc parameter. If it is null, the value
/// of args.Parallel is used. If that is null, zero is used (which means "automatic").
/// </summary>
/// <param name="env">The host environment; must not be null.</param>
/// <param name="args">The arguments; must not be null.</param>
/// <param name="name">The registration name; must not be null or white space.</param>
/// <param name="conc">The degree of concurrency; must be non-negative or null.</param>
protected ImplBase(IHostEnvironment env, TArgs args, string name, int? conc = null)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(args, nameof(args));
    // Validate the name before its first use in Register below.
    env.CheckNonWhiteSpace(name, nameof(name));
    env.CheckParam(conc == null || conc >= 0, nameof(conc), "Degree of concurrency must be non-negative (or null)");

    conc = conc ?? args.Parallel;
    // !(conc < 0) rather than conc >= 0, so that a null conc still passes.
    env.CheckUserArg(!(conc < 0), nameof(args.Parallel), "Degree of parallelism must be non-negative (or null)");

    // Capture the environment options from args.
    env = env.Register(name, args.RandomSeed, args.Verbose, conc);

    Host = env.Register(name);
    Args = args;
    _serverFactory = args.Server;
    Utils.CheckOptionalUserDirectory(args.OutputModelFile, nameof(args.OutputModelFile));
}
/// <summary>
/// Initializes the estimator with its calibrator trainer and the label, score and (optional) weight columns.
/// </summary>
/// <param name="env">The host environment; must not be null.</param>
/// <param name="calibratorTrainer">The calibrator trainer; must not be null.</param>
/// <param name="labelColumn">The label column name. May be null or white space only when the trainer does not need training.</param>
/// <param name="scoreColumn">The score column name.</param>
/// <param name="weightColumn">The weight column name, or null when there is none.</param>
private protected CalibratorEstimatorBase(IHostEnvironment env, ICalibratorTrainer calibratorTrainer, string labelColumn, string scoreColumn, string weightColumn)
{
    // Both are used below (and stored), so reject null here with a proper argument exception.
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(calibratorTrainer, nameof(calibratorTrainer));

    Host = env;
    _calibratorTrainer = calibratorTrainer;

    if (!string.IsNullOrWhiteSpace(labelColumn))
    {
        LabelColumn = TrainerUtils.MakeBoolScalarLabel(labelColumn);
    }
    else
    {
        // Without a label column, only calibrators that need no training are acceptable.
        env.CheckParam(!calibratorTrainer.NeedsTraining, nameof(labelColumn), "For trained calibrators, " + nameof(labelColumn) + " must be specified.");
    }

    ScoreColumn = TrainerUtils.MakeR4ScalarColumn(scoreColumn); // Do we fathom this being named anything else (renaming column)? Complete metadata?

    if (weightColumn != null)
    {
        WeightColumn = TrainerUtils.MakeR4ScalarWeightColumn(weightColumn);
    }
}
// Factory method for SignatureDataScorer.
// Requires the mapper to be a RowMapper, then builds the scorer component for it and applies it to the data.
private static IDataScorerTransform Create(IHostEnvironment env, Arguments args, IDataView data, ISchemaBoundMapper mapper, RoleMappedSchema trainSchema)
{
    Contracts.CheckValue(env, nameof(env));
    // args is dereferenced right below; fail with a clear argument error if it is missing.
    env.CheckValue(args, nameof(args));
    env.CheckValue(data, nameof(data));
    env.CheckValue(mapper, nameof(mapper));

    if (args.Top < 0)
    {
        throw env.Except("Number of top contribution must be non negative");
    }

    if (args.Bottom < 0)
    {
        throw env.Except("Number of bottom contribution must be non negative");
    }

    var contributionMapper = mapper as RowMapper;
    // Check the result of the cast: mapper itself is already known to be non-null, so testing it
    // again would let a mapper of the wrong type through as a null contributionMapper.
    env.CheckParam(contributionMapper != null, nameof(mapper), "Unexpected mapper");

    var scorer = ScoreUtils.GetScorerComponent(env, contributionMapper);
    var scoredPipe = scorer.CreateComponent(env, data, contributionMapper, trainSchema);
    return scoredPipe;
}
/// <summary>
/// Computes the feature selection scores for each slot of each requested column by running one
/// count aggregator per column over every row of the input.
/// </summary>
/// <param name="env">The host environment.</param>
/// <param name="input">The input dataview.</param>
/// <param name="columns">The columns for which to compute the feature selection scores.</param>
/// <param name="colSizes">Outputs an array containing the vector sizes of the input columns</param>
/// <returns>A list of scores.</returns>
public static long[][] Train(IHostEnvironment env, IDataView input, string[] columns, out int[] colSizes)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(input, nameof(input));
    env.CheckParam(Utils.Size(columns) > 0, nameof(columns));

    var inputSchema = input.Schema;
    var columnCount = columns.Length;

    var isActive = new bool[inputSchema.ColumnCount];
    var sourceIndices = new int[columnCount];
    var sourceTypes = new ColumnType[columnCount];
    colSizes = new int[columnCount];

    // Resolve each requested column and reject missing or variable-length ones.
    for (int slot = 0; slot < columnCount; slot++)
    {
        var name = columns[slot];

        int sourceIndex;
        bool found = inputSchema.TryGetColumnIndex(name, out sourceIndex);
        if (!found)
        {
            throw env.ExceptUserArg(nameof(CountFeatureSelectionTransform.Arguments.Column), "Source column '{0}' not found", name);
        }

        var sourceType = inputSchema.GetColumnType(sourceIndex);
        if (sourceType.IsVector && !sourceType.IsKnownSizeVector)
        {
            throw env.ExceptUserArg(nameof(CountFeatureSelectionTransform.Arguments.Column), "Variable length column '{0}' is not allowed", name);
        }

        isActive[sourceIndex] = true;
        sourceIndices[slot] = sourceIndex;
        sourceTypes[slot] = sourceType;
        colSizes[slot] = sourceType.ValueCount;
    }

    var aggregators = new CountAggregator[columnCount];
    long rowsSeen = 0;
    double totalRows = input.GetRowCount(true) ?? double.NaN;

    using (var progress = env.StartProgressChannel("Aggregating counts"))
    using (var cursor = input.GetRowCursor(col => isActive[col]))
    {
        // The progress callback reads the captured counters at the time it is invoked.
        progress.SetHeader(
            new ProgressHeader(new[] { "rows" }),
            e => { e.SetProgress(0, rowsSeen, totalRows); });

        for (int slot = 0; slot < columnCount; slot++)
        {
            if (sourceTypes[slot].IsVector)
            {
                aggregators[slot] = GetVecAggregator(cursor, sourceTypes[slot], sourceIndices[slot]);
            }
            else
            {
                aggregators[slot] = GetOneAggregator(cursor, sourceTypes[slot], sourceIndices[slot]);
            }
        }

        while (cursor.MoveNext())
        {
            foreach (var aggregator in aggregators)
            {
                aggregator.ProcessValue();
            }

            rowsSeen++;
        }

        progress.Checkpoint(rowsSeen);
    }

    var counts = new long[columnCount][];
    for (int slot = 0; slot < columnCount; slot++)
    {
        counts[slot] = aggregators[slot].Count;
    }

    return counts;
}
/// <summary>
/// Creates a scalar row mapper. The output schema must consist of exactly one column, and that
/// column must be of a numeric type.
/// </summary>
/// <param name="schema">The role-mapped input schema, forwarded to the base constructor.</param>
/// <param name="parent">The parent predictor, forwarded to the base constructor.</param>
/// <param name="env">The host environment used for the checks.</param>
/// <param name="outputSchema">The output schema to validate.</param>
public XGBoostScalarRowMapper(RoleMappedSchema schema, XGBoostPredictorBase<Float> parent, IHostEnvironment env, ISchema outputSchema)
    : base(schema, parent, env, outputSchema)
{
    bool hasSingleColumn = outputSchema.ColumnCount == 1;
    env.CheckParam(hasSingleColumn, nameof(outputSchema));

    var scoreType = outputSchema.GetColumnType(0);
    env.CheckParam(scoreType.IsNumber, nameof(outputSchema));
}