/// <summary>
/// Builds the column bindings from the user-specified column list, rejecting
/// empty/whitespace names and duplicates.
/// </summary>
/// <param name="args">Arguments containing the non-empty <c>Columns</c> list.</param>
/// <param name="keep">Whether the listed columns are kept (vs. dropped).</param>
/// <param name="schemaInput">The input schema the bindings are defined over.</param>
public Bindings(ArgumentsBase args, bool keep, ISchema schemaInput)
{
    Contracts.AssertValue(args);
    Contracts.AssertNonEmpty(args.Columns);
    Contracts.AssertValue(schemaInput);

    Keep = keep;
    Input = schemaInput;

    Names = new HashSet<string>();
    foreach (var columnName in args.Columns)
    {
        Contracts.CheckNonWhiteSpace(columnName, nameof(args.Columns));
        // REVIEW: Should this just be a warning?
        if (!Names.Add(columnName))
            throw Contracts.ExceptUserArg(nameof(args.Columns), "Column '{0}' specified multiple times", columnName);
    }

    BuildMap(out ColMap, out NameToCol);
}
/// <summary>
/// Initializes the <see cref="MetaMulticlassTrainer{TTransformer, TModel}"/> from the Arguments class.
/// </summary>
/// <param name="env">The private instance of the <see cref="IHostEnvironment"/>.</param>
/// <param name="args">The legacy arguments <see cref="ArgumentsBase"/> class.</param>
/// <param name="name">The component name.</param>
/// <param name="labelColumn">The label column for the metalinear trainer and the binary trainer.</param>
/// <param name="singleEstimator">The binary estimator.</param>
/// <param name="calibrator">The calibrator. If a calibrator is not explicitly provided, it will default to <see cref="PlattCalibratorTrainer"/>.</param>
internal MetaMulticlassTrainer(IHostEnvironment env, ArgumentsBase args, string name, string labelColumn = null,
    TScalarTrainer singleEstimator = null, ICalibratorTrainer calibrator = null)
{
    Host = Contracts.CheckRef(env, nameof(env)).Register(name);
    Host.CheckValue(args, nameof(args));
    Args = args;

    if (labelColumn != null)
    {
        // NOTE(review): the label is declared as a scalar U4 key column — confirm the
        // underlying binary trainers expect a key-typed label.
        LabelColumn = new SchemaShape.Column(labelColumn, SchemaShape.Column.VectorKind.Scalar, NumberType.U4, true);
    }

    Trainer = singleEstimator ?? CreateTrainer();

    // Precedence: a calibrator specified in the arguments overrides the one passed in code;
    // otherwise use the supplied calibrator, falling back to the Platt default.
    Calibrator = calibrator ?? new PlattCalibratorTrainer(env);

    if (args.Calibrator != null)
    {
        Calibrator = args.Calibrator.CreateComponent(Host);
    }

    // Regarding caching, no matter what the internal predictor, we're performing many passes
    // simply by virtue of this being a meta-trainer, so we will still cache.
    Info = new TrainerInfo(normalization: Trainer.Info.NeedNormalization);
}
/// <summary>
/// Initializes the ensemble trainer: validates the base-predictor factories, resolves the
/// model count, creates the subset selector and one trainer per model slot.
/// </summary>
/// <param name="args">The ensemble arguments.</param>
/// <param name="env">The host environment.</param>
/// <param name="name">The registration name for the host.</param>
private protected EnsembleTrainerBase(ArgumentsBase args, IHostEnvironment env, string name)
{
    Contracts.CheckValue(env, nameof(env));
    Host = env.Register(name);
    Args = args;

    using (var ch = Host.Start("Init"))
    {
        var predictorFactories = Args.GetPredictorFactories();
        ch.CheckUserArg(Utils.Size(predictorFactories) > 0,
            nameof(EnsembleTrainer.Arguments.BasePredictors), "This should have at-least one value");

        // When unset, the model count defaults to the factory count, except that a single
        // factory gets DefaultNumModels replicas.
        NumModels = Args.NumModels ??
            (predictorFactories.Length == 1 ? DefaultNumModels : predictorFactories.Length);

        ch.CheckUserArg(NumModels > 0, nameof(Args.NumModels),
            "Must be positive, or null to indicate numModels is the number of base predictors");

        if (Utils.Size(predictorFactories) > NumModels)
            ch.Warning("The base predictor count is greater than models count. Some of the base predictors will be ignored.");

        _subsetSelector = Args.SamplingType.CreateComponent(Host);

        // Cycle through the factories so every model slot gets a trainer instance.
        Trainers = new ITrainer<IPredictorProducing<TOutput>>[NumModels];
        for (int idx = 0; idx < Trainers.Length; idx++)
            Trainers[idx] = predictorFactories[idx % predictorFactories.Length].CreateComponent(Host);

        // We infer normalization and calibration preferences from the trainers. However, even if the internal trainers
        // don't need caching we are performing multiple passes over the data, so it is probably appropriate to always cache.
        Info = new TrainerInfo(
            normalization: Trainers.Any(t => t.Info.NeedNormalization),
            calibration: Trainers.Any(t => t.Info.NeedCalibration));
    }
}
/// <summary>
/// Creates a transmission request for the given method, capturing the payload
/// from the arguments object.
/// </summary>
/// <param name="method">The method name to invoke.</param>
/// <param name="arguments">The arguments whose <c>Data</c> becomes the request payload.</param>
public TransmissionRequest(string method, ArgumentsBase arguments)
{
    Arguments = arguments.Data;
    Method = method;
}
/// <summary>
/// Utility method to create the file-based <see cref="TermMap"/> if the <see cref="ArgumentsBase.DataFile"/>
/// argument of <paramref name="args"/> was present.
/// </summary>
private static TermMap CreateFileTermMap(IHostEnvironment env, IChannel ch, ArgumentsBase args, Builder bldr)
{
    Contracts.AssertValue(ch);
    ch.AssertValue(env);
    ch.AssertValue(args);
    ch.Assert(!string.IsNullOrWhiteSpace(args.DataFile));
    ch.AssertValue(bldr);

    string file = args.DataFile;
    // First column using the file.
    string src = args.TermsColumn;
    var sub = args.Loader;
    // If the user manually specifies a loader, or this is already a pre-processed binary
    // file, then we assume the user knows what they're doing and do not attempt to convert
    // to the desired type ourselves.
    bool autoConvert = false;
    if (!sub.IsGood())
    {
        // Determine the default loader from the extension.
        var ext = Path.GetExtension(file);
        bool isBinary = string.Equals(ext, ".idv", StringComparison.OrdinalIgnoreCase);
        bool isTranspose = string.Equals(ext, ".tdv", StringComparison.OrdinalIgnoreCase);
        if (isBinary || isTranspose)
        {
            ch.Assert(isBinary != isTranspose);
            // Binary/transposed files have no fixed default column, so the user must name one.
            ch.CheckUserArg(!string.IsNullOrWhiteSpace(src), nameof(args.TermsColumn), "Must be specified");
            if (isBinary)
            {
                sub = new SubComponent<IDataLoader, SignatureDataLoader>("BinaryLoader");
            }
            else
            {
                ch.Assert(isTranspose);
                sub = new SubComponent<IDataLoader, SignatureDataLoader>("TransposeLoader");
            }
        }
        else
        {
            if (!string.IsNullOrWhiteSpace(src))
            {
                ch.Warning("{0} should not be specified when default loader is TextLoader. Ignoring {0}={1}",
                    nameof(Arguments.TermsColumn), src);
            }
            // Default: read a single tab-separated text column named "Term" and convert
            // items to the builder's type.
            sub = new SubComponent<IDataLoader, SignatureDataLoader>("TextLoader", "sep=tab col=Term:TX:0");
            src = "Term";
            autoConvert = true;
        }
    }
    ch.AssertNonEmpty(src);

    int colSrc;
    var loader = sub.CreateInstance(env, new MultiFileSource(file));
    if (!loader.Schema.TryGetColumnIndex(src, out colSrc))
    {
        throw ch.ExceptUserArg(nameof(args.TermsColumn), "Unknown column '{0}'", src);
    }
    var typeSrc = loader.Schema.GetColumnType(colSrc);
    // When not auto-converting, the file column's type must match the builder's item type exactly.
    if (!autoConvert && !typeSrc.Equals(bldr.ItemType))
    {
        throw ch.ExceptUserArg(nameof(args.TermsColumn), "Must be of type '{0}' but was '{1}'", bldr.ItemType, typeSrc);
    }

    using (var cursor = loader.GetRowCursor(col => col == colSrc))
    using (var pch = env.StartProgressChannel("Building term dictionary from file"))
    {
        var header = new ProgressHeader(new[] { "Total Terms" }, new[] { "examples" });
        var trainer = Trainer.Create(cursor, colSrc, autoConvert, int.MaxValue, bldr);
        double rowCount = loader.GetRowCount(true) ?? double.NaN;
        long rowCur = 0;
        pch.SetHeader(header,
            e =>
            {
                e.SetProgress(0, rowCur, rowCount);
                // Purely feedback for the user. That the other thread might be
                // working in the background is not a problem.
                e.SetMetric(0, trainer.Count);
            });
        // Feed rows until the cursor is exhausted or the trainer declines more input.
        while (cursor.MoveNext() && trainer.ProcessRow())
        {
            rowCur++;
        }
        if (trainer.Count == 0)
        {
            ch.Warning("Term map loaded from file resulted in an empty map.");
        }
        pch.Checkpoint(trainer.Count, rowCur);
        return (trainer.Finish());
    }
}
/// <summary>
/// Initializes the scalar stacking combiner; all initialization is delegated to the base class.
/// </summary>
/// <param name="env">The host environment.</param>
/// <param name="name">The registration name.</param>
/// <param name="args">The combiner arguments.</param>
internal BaseScalarStacking(IHostEnvironment env, string name, ArgumentsBase args)
    : base(env, name, args)
{
}
/// <summary>
/// Initializes the evaluator and instantiates the user-specified regression loss function.
/// </summary>
/// <param name="args">Arguments carrying the mandatory loss-function factory.</param>
/// <param name="env">The host environment.</param>
/// <param name="registrationName">The registration name passed to the base class.</param>
protected RegressionLossEvaluatorBase(ArgumentsBase args, IHostEnvironment env, string registrationName)
    : base(env, registrationName)
{
    // Check first so a missing loss function surfaces as a user error, not an NRE below.
    Host.CheckUserArg(args.LossFunction != null, nameof(args.LossFunction), "Loss function must be specified.");
    LossFunction = args.LossFunction.CreateComponent(env);
}
/// <summary>
/// Populates <paramref name="stopWordsMap"/> from one of three sources, in priority order:
/// the comma-separated <c>Stopwords</c> string, the <c>Stopword</c> array, or a data file.
/// Each stopword is lower-cased before being pooled.
/// </summary>
/// <param name="env">The host environment.</param>
/// <param name="ch">The channel for warnings and user-argument checks.</param>
/// <param name="loaderArgs">Arguments specifying the stopword source.</param>
/// <param name="stopWordsMap">Receives the pool of normalized stopwords.</param>
private void LoadStopWords(IHostEnvironment env, IChannel ch, ArgumentsBase loaderArgs, out NormStr.Pool stopWordsMap)
{
    Contracts.AssertValue(env);
    env.AssertValue(ch);
    ch.AssertValue(loaderArgs);

    // An explicit list wins over file-based arguments; warn if both were supplied.
    if ((!string.IsNullOrEmpty(loaderArgs.Stopwords) || Utils.Size(loaderArgs.Stopword) > 0) &&
        (!string.IsNullOrWhiteSpace(loaderArgs.DataFile) || loaderArgs.Loader != null ||
            !string.IsNullOrWhiteSpace(loaderArgs.StopwordsColumn)))
    {
        ch.Warning("Explicit stopwords list specified. Data file arguments will be ignored");
    }

    var src = default(ReadOnlyMemory<char>);
    stopWordsMap = new NormStr.Pool();
    var buffer = new StringBuilder();

    var stopwords = loaderArgs.Stopwords.AsMemory();
    stopwords = ReadOnlyMemoryUtils.TrimSpaces(stopwords);
    if (!stopwords.IsEmpty)
    {
        // Source 1: comma-separated 'stopwords' string.
        bool warnEmpty = true;
        for (bool more = true; more;)
        {
            ReadOnlyMemory<char> stopword;
            more = ReadOnlyMemoryUtils.SplitOne(stopwords, ',', out stopword, out stopwords);
            stopword = ReadOnlyMemoryUtils.TrimSpaces(stopword);
            if (!stopword.IsEmpty)
            {
                buffer.Clear();
                ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(stopword.Span, buffer);
                stopWordsMap.Add(buffer);
            }
            else if (warnEmpty)
            {
                // Warn only once per source about empty entries.
                ch.Warning("Empty strings ignored in 'stopwords' specification");
                warnEmpty = false;
            }
        }
        ch.CheckUserArg(stopWordsMap.Count > 0, nameof(Arguments.Stopwords), "stopwords is empty");
    }
    else if (Utils.Size(loaderArgs.Stopword) > 0)
    {
        // Source 2: 'stopword' array, one entry per word.
        bool warnEmpty = true;
        foreach (string word in loaderArgs.Stopword)
        {
            var stopword = word.AsSpan();
            stopword = stopword.Trim(' ');
            if (!stopword.IsEmpty)
            {
                buffer.Clear();
                ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(stopword, buffer);
                stopWordsMap.Add(buffer);
            }
            else if (warnEmpty)
            {
                ch.Warning("Empty strings ignored in 'stopword' specification");
                warnEmpty = false;
            }
        }
    }
    else
    {
        // Source 3: a text column in a data file.
        string srcCol = loaderArgs.StopwordsColumn;
        var loader = LoadStopwords(env, ch, loaderArgs.DataFile, loaderArgs.Loader, ref srcCol);
        int colSrc;
        if (!loader.Schema.TryGetColumnIndex(srcCol, out colSrc))
        {
            throw ch.ExceptUserArg(nameof(Arguments.StopwordsColumn), "Unknown column '{0}'", srcCol);
        }
        var typeSrc = loader.Schema[colSrc].Type;
        ch.CheckUserArg(typeSrc.IsText, nameof(Arguments.StopwordsColumn), "Must be a scalar text column");

        // Accumulate the stopwords.
        using (var cursor = loader.GetRowCursor(col => col == colSrc))
        {
            bool warnEmpty = true;
            var getter = cursor.GetGetter<ReadOnlyMemory<char>>(colSrc);
            while (cursor.MoveNext())
            {
                getter(ref src);
                if (!src.IsEmpty)
                {
                    buffer.Clear();
                    ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(src.Span, buffer);
                    stopWordsMap.Add(buffer);
                }
                else if (warnEmpty)
                {
                    ch.Warning("Empty rows ignored in data file");
                    warnEmpty = false;
                }
            }
        }
        ch.CheckUserArg(stopWordsMap.Count > 0, nameof(Arguments.DataFile), "dataFile is empty");
    }
}
/// <summary>
/// Per-column extra info: parses the term separator characters from the arguments.
/// </summary>
/// <param name="args">Arguments carrying the separator specification string.</param>
public ColInfoEx(ArgumentsBase args)
{
    Separators = PredictionUtil.SeparatorFromString(args.TermSeparators);
    // At least one separator character must result from parsing.
    Contracts.CheckUserArg(Utils.Size(Separators) > 0, nameof(args.TermSeparators));
}
/// <summary>
/// Convenience constructor that unpacks the argument object and forwards each
/// setting to the main constructor.
/// </summary>
/// <param name="args">The transform arguments to unpack.</param>
/// <param name="name">The transform's registration name.</param>
/// <param name="env">The host environment.</param>
private protected SequentialAnomalyDetectionTransformBase(ArgumentsBase args, string name, IHostEnvironment env)
    : this(args.WindowSize, args.InitialWindowSize, args.Source, args.Name, name, env, args.Side, args.Martingale,
        args.AlertOn, args.PowerMartingaleEpsilon, args.AlertThreshold)
{
}
/// <summary>
/// Initializes the sweeper with an explicitly supplied set of value generators.
/// </summary>
/// <param name="args">The sweeper arguments.</param>
/// <param name="sweepParameters">The value generators to sweep over.</param>
/// <param name="name">The sweeper name.
/// NOTE(review): currently unused in this constructor — presumably consumed by derived types; confirm.</param>
protected SweeperBase(ArgumentsBase args, IValueGenerator[] sweepParameters, string name)
{
    SweepParameters = sweepParameters;
    _args = args;
}
/// <summary>
/// Initializes the sweeper, materializing the value generators declared in the arguments.
/// </summary>
/// <param name="args">The sweeper arguments providing the swept parameters.</param>
/// <param name="name">The sweeper name.
/// NOTE(review): currently unused in this constructor — presumably consumed by derived types; confirm.</param>
protected SweeperBase(ArgumentsBase args, string name)
{
    _args = args;
    // Snapshot the swept parameters once so later enumeration is stable.
    SweepParameters = args.SweptParameters.ToArray();
}
// -- methods

/// <summary>
/// Applies the given arguments to this runtime.
/// </summary>
/// <param name="args">The arguments to apply.</param>
/// <returns>The runtime instance, to allow call chaining.</returns>
public abstract RuntimeBase With(ArgumentsBase args);
/// <summary>
/// Applies the given arguments to the runtime registered under <paramref name="id"/>.
/// </summary>
/// <param name="id">The identifier of the target runtime.</param>
/// <param name="args">The arguments to apply.</param>
public void SetArguments(string id, ArgumentsBase args)
{
    var runtime = this[id];
    runtime.With(args);
}
/// <summary>
/// Replaces the arguments entry identified by the given GUID.
/// </summary>
/// <param name="args">The arguments array to modify.</param>
/// <param name="guid">The GUID identifying the slot to replace.</param>
/// <param name="value">The replacement arguments.</param>
public static void SetByGuid(this ArgumentsBase[] args, string guid, ArgumentsBase value)
{
    // Resolve the slot for this GUID, then overwrite it in place.
    args[args.GetIndexByGuid(guid)] = value;
}
/// <summary>
/// Initializes a sweeper that samples swept parameters uniformly at random.
/// </summary>
/// <param name="args">The sweeper arguments; forwarded to the base sweeper.</param>
public UniformRandomSweeper(ArgumentsBase args)
    : base(args, "UniformRandom")
{
}
/// <summary>
/// Initializes the best-performance sub-model selector; all work is delegated to the base class.
/// </summary>
/// <param name="args">The selector arguments.</param>
/// <param name="env">The host environment.</param>
/// <param name="name">The registration name.</param>
protected BaseBestPerformanceSelector(ArgumentsBase args, IHostEnvironment env, string name)
    : base(args, env, name)
{
}
/// <summary>
/// Initializes a uniform-random sweeper over an explicit set of value generators.
/// </summary>
/// <param name="args">The sweeper arguments; forwarded to the base sweeper.</param>
/// <param name="sweepParameters">The value generators to sweep over.</param>
public UniformRandomSweeper(ArgumentsBase args, IValueGenerator[] sweepParameters)
    : base(args, sweepParameters, "UniformRandom")
{
}
/// <summary>
/// Registers the given simulator together with its arguments.
/// </summary>
/// <param name="simulator">The simulator to register.</param>
/// <param name="args">The arguments associated with the simulator.</param>
/// <returns>The registered simulator.</returns>
public abstract ISimulatable RegisterSimulator(ISimulatable simulator, ArgumentsBase args);
// -- methods
/// <summary>
/// Applies the given arguments to this runtime.
/// The arguments must be of type RuntimeArgs.
/// </summary>
/// <param name="args">The arguments to apply; must be a <see cref="RuntimeArgs"/> instance, or null.</param>
/// <returns>The calling instance, to allow call chaining.</returns>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="args"/> is non-null but not a <see cref="RuntimeArgs"/>.
/// </exception>
public override RuntimeBase With(ArgumentsBase args)
{
    // The previous implementation used "args as RuntimeArgs", which silently stored null
    // for a wrong-typed argument despite the documented contract, deferring the failure
    // to a confusing NullReferenceException later. Fail fast instead.
    if (args != null && !(args is RuntimeArgs))
        throw new ArgumentException("Arguments must be of type RuntimeArgs.", nameof(args));
    _args = (RuntimeArgs)args;
    return this;
}
/// <summary>
/// Initializes the i.i.d. anomaly detector, forcing the initial window size to zero
/// (presumably because an i.i.d. model needs no warm-up window — TODO confirm).
/// </summary>
/// <param name="args">The detector arguments.</param>
/// <param name="name">The registration name.</param>
/// <param name="env">The host environment.</param>
/// <param name="input">The input data view.</param>
public IidAnomalyDetectionBase(ArgumentsBase args, string name, IHostEnvironment env, IDataView input)
    : base(args, name, env, input)
{
    InitialWindowSize = 0;
}
/// <summary>
/// Initializes a uniform-random sweeper bound to a host environment.
/// </summary>
/// <param name="env">The host environment.</param>
/// <param name="args">The sweeper arguments; forwarded to the base sweeper.</param>
public UniformRandomSweeper(IHostEnvironment env, ArgumentsBase args)
    : base(args, env, "UniformRandom")
{
}
/// <summary>
/// Initializes the multi-class averaging combiner; all work is delegated to the base class.
/// </summary>
/// <param name="env">The host environment.</param>
/// <param name="name">The registration name.</param>
/// <param name="args">The combiner arguments.</param>
internal BaseMultiAverager(IHostEnvironment env, string name, ArgumentsBase args)
    : base(env, name, args)
{
}
/// <summary>
/// Initializes a uniform-random sweeper bound to a host environment, with an explicit
/// set of value generators.
/// </summary>
/// <param name="env">The host environment.</param>
/// <param name="args">The sweeper arguments; forwarded to the base sweeper.</param>
/// <param name="sweepParameters">The value generators to sweep over.</param>
public UniformRandomSweeper(IHostEnvironment env, ArgumentsBase args, IValueGenerator[] sweepParameters)
    : base(args, env, sweepParameters, "UniformRandom")
{
}
/// <summary>
/// Initializes the regression evaluator; all work is delegated to the base class.
/// </summary>
/// <param name="args">The evaluator arguments.</param>
/// <param name="env">The host environment.</param>
/// <param name="registrationName">The registration name.</param>
private protected RegressionEvaluatorBase(ArgumentsBase args, IHostEnvironment env, string registrationName)
    : base(args, env, registrationName)
{
}
// -- constructor
/// <summary>
/// Creates event args capturing the supplied arguments and the creation time.
/// </summary>
/// <param name="arguments">The arguments object associated with the event.</param>
public SimulatorEventArgs(ArgumentsBase arguments)
{
    Arguments = arguments;
    // NOTE(review): stamps local time (DateTime.Now) — confirm consumers do not expect UTC.
    Timestamp = DateTime.Now;
}
/// <summary>
/// Trains the normalizer: makes a single pass over the source data, feeding each
/// normalized column's values to its column-function builder, then materializes the
/// resulting column functions.
/// </summary>
/// <param name="host">The host.</param>
/// <param name="args">Arguments describing the columns to normalize.</param>
/// <param name="input">The input data view to train over.</param>
/// <param name="fnCreate">Factory producing a column-function builder per column
/// (receives the column index, source index, source type, and the training cursor).</param>
/// <param name="extraTrainColumnIds">Additional column ids to activate on the cursor
/// (e.g. columns the builders read besides the normalized sources).</param>
private NormalizeTransform(IHost host, ArgumentsBase args, IDataView input,
    Func<int, int, ColumnType, IRowCursor, IColumnFunctionBuilder> fnCreate,
    params int[] extraTrainColumnIds)
    : base(host, host.CheckRef(args, nameof(args)).GetColumns(), input, args.TestType)
{
    Host.AssertNonEmpty(Infos);
    Host.Assert(Utils.Size(Infos) == Utils.Size(args.GetColumns()));

    // Mark which input columns the training cursor must expose.
    bool[] activeInput = new bool[Source.Schema.ColumnCount];
    if (Utils.Size(extraTrainColumnIds) > 0)
    {
        foreach (var colId in extraTrainColumnIds)
        {
            Host.Assert(0 <= colId && colId < activeInput.Length);
            activeInput[colId] = true;
        }
    }
    foreach (var info in Infos)
    {
        activeInput[info.Source] = true;
    }

    var functionBuilders = new IColumnFunctionBuilder[Infos.Length];
    var needMoreData = new bool[Infos.Length];

    // Go through the input data and pass it to the column function builders.
    using (var pch = Host.StartProgressChannel("Normalize"))
    {
        long numRows = 0;
        pch.SetHeader(new ProgressHeader("examples"), e => e.SetProgress(0, numRows));
        using (var cursor = Source.GetRowCursor(col => activeInput[col]))
        {
            for (int i = 0; i < Infos.Length; i++)
            {
                needMoreData[i] = true;
                var info = Infos[i];
                functionBuilders[i] = fnCreate(i, info.Source, info.TypeSrc, cursor);
            }
            while (cursor.MoveNext())
            {
                // If the row has bad values, the good values are still being used for training.
                // The comparisons in the code below are arranged so that NaNs in the input are not recorded.
                // REVIEW: Should infinities and/or NaNs be filtered before the normalization? Should we not record infinities for min/max?
                // Currently, infinities are recorded and will result in zero scale which in turn will result in NaN output for infinity input.
                bool any = false;
                for (int i = 0; i < Infos.Length; i++)
                {
                    // Skip builders that have declared themselves done.
                    if (!needMoreData[i])
                    {
                        continue;
                    }
                    var info = Infos[i];
                    Host.Assert(!info.TypeSrc.IsVector || info.TypeSrc.IsVector && info.TypeSrc.IsKnownSizeVector);
                    Host.Assert(functionBuilders[i] != null);
                    any |= needMoreData[i] = functionBuilders[i].ProcessValue();
                }
                numRows++;
                // Stop early once every builder has all the data it needs.
                if (!any)
                {
                    break;
                }
            }
        }
        pch.Checkpoint(numRows);

        _functions = new IColumnFunction[Infos.Length];
        for (int i = 0; i < Infos.Length; i++)
        {
            _functions[i] = functionBuilders[i].CreateColumnFunction();
        }
    }
    SetMetadata();
}
/// <summary>
/// Replaces the arguments entry identified by the given name.
/// </summary>
/// <param name="args">The arguments array to modify.</param>
/// <param name="name">The name identifying the slot to replace.</param>
/// <param name="value">The replacement arguments.</param>
public static void SetByName(this ArgumentsBase[] args, string name, ArgumentsBase value)
{
    // Resolve the slot for this name, then overwrite it in place.
    args[args.GetIndexByName(name)] = value;
}
/// <summary>
/// This builds the <see cref="TermMap"/> instances per column. Each column's map comes from
/// one of three sources, in priority order: an explicit terms argument, the shared data file,
/// or auto-training over <paramref name="trainingData"/>.
/// </summary>
/// <param name="env">The host environment.</param>
/// <param name="ch">The channel for warnings and user-argument checks.</param>
/// <param name="infos">Per-column info (name, source index, source type).</param>
/// <param name="args">The transform-level arguments (column defaults).</param>
/// <param name="column">Per-column argument overrides, parallel to <paramref name="infos"/>.</param>
/// <param name="trainingData">The data view used when a column's terms must be trained.</param>
/// <returns>One trained <see cref="TermMap"/> per column.</returns>
private static TermMap[] Train(IHostEnvironment env, IChannel ch, ColInfo[] infos,
    ArgumentsBase args, ColumnBase[] column, IDataView trainingData)
{
    Contracts.AssertValue(env);
    env.AssertValue(ch);
    ch.AssertValue(infos);
    ch.AssertValue(args);
    ch.AssertValue(column);
    ch.AssertValue(trainingData);

    if ((args.Term != null || !string.IsNullOrEmpty(args.Terms)) &&
        (!string.IsNullOrWhiteSpace(args.DataFile) || args.Loader.IsGood() ||
            !string.IsNullOrWhiteSpace(args.TermsColumn)))
    {
        ch.Warning("Explicit term list specified. Data file arguments will be ignored");
    }

    if (!Enum.IsDefined(typeof(SortOrder), args.Sort))
    {
        throw ch.ExceptUserArg(nameof(args.Sort), "Undefined sorting criteria '{0}' detected", args.Sort);
    }

    TermMap termsFromFile = null;
    var termMap = new TermMap[infos.Length];
    int[] lims = new int[infos.Length];
    int trainsNeeded = 0;
    HashSet<int> toTrain = null;

    for (int iinfo = 0; iinfo < infos.Length; iinfo++)
    {
        // First check whether we have a terms argument, and handle it appropriately.
        var terms = new DvText(column[iinfo].Terms);
        var termsArray = column[iinfo].Term;
        if (!terms.HasChars && termsArray == null)
        {
            // Per-column terms absent: fall back to the transform-level defaults.
            terms = new DvText(args.Terms);
            termsArray = args.Term;
        }
        terms = terms.Trim();
        if (terms.HasChars || (termsArray != null && termsArray.Length > 0))
        {
            // We have terms! Pass it in.
            var sortOrder = column[iinfo].Sort ?? args.Sort;
            if (!Enum.IsDefined(typeof(SortOrder), sortOrder))
            {
                throw ch.ExceptUserArg(nameof(args.Sort), "Undefined sorting criteria '{0}' detected for column '{1}'", sortOrder, infos[iinfo].Name);
            }
            var bldr = Builder.Create(infos[iinfo].TypeSrc, sortOrder);
            if (terms.HasChars)
            {
                bldr.ParseAddTermArg(ref terms, ch);
            }
            else
            {
                bldr.ParseAddTermArg(termsArray, ch);
            }
            termMap[iinfo] = bldr.Finish();
        }
        else if (!string.IsNullOrWhiteSpace(args.DataFile))
        {
            // First column using this file.
            if (termsFromFile == null)
            {
                var bldr = Builder.Create(infos[iinfo].TypeSrc, column[iinfo].Sort ?? args.Sort);
                termsFromFile = CreateFileTermMap(env, ch, args, bldr);
            }
            if (!termsFromFile.ItemType.Equals(infos[iinfo].TypeSrc.ItemType))
            {
                // We have no current plans to support re-interpretation based on different column
                // type, not only because it's unclear what realistic customer use-cases for such
                // a complicated feature would be, and also because it's difficult to see how we
                // can logically reconcile "reinterpretation" for different types with the resulting
                // data view having an actual type.
                throw ch.ExceptUserArg(nameof(args.DataFile), "Data file terms loaded as type '{0}' but mismatches column '{1}' item type '{2}'", termsFromFile.ItemType, infos[iinfo].Name, infos[iinfo].TypeSrc.ItemType);
            }
            termMap[iinfo] = termsFromFile;
        }
        else
        {
            // Auto train this column. Leave the term map null for now, but set the lim appropriately.
            lims[iinfo] = column[iinfo].MaxNumTerms ?? args.MaxNumTerms;
            ch.CheckUserArg(lims[iinfo] > 0, nameof(Column.MaxNumTerms), "Must be positive");
            Utils.Add(ref toTrain, infos[iinfo].Source);
            ++trainsNeeded;
        }
    }

    ch.Assert((Utils.Size(toTrain) == 0) == (trainsNeeded == 0));
    ch.Assert(Utils.Size(toTrain) <= trainsNeeded);
    if (trainsNeeded > 0)
    {
        Trainer[] trainer = new Trainer[trainsNeeded];
        int[] trainerInfo = new int[trainsNeeded];
        // Open the cursor, then instantiate the trainers.
        int itrainer;
        using (var cursor = trainingData.GetRowCursor(toTrain.Contains))
        using (var pch = env.StartProgressChannel("Building term dictionary"))
        {
            long rowCur = 0;
            double rowCount = trainingData.GetRowCount(true) ?? double.NaN;
            var header = new ProgressHeader(new[] { "Total Terms" }, new[] { "examples" });
            itrainer = 0;
            for (int iinfo = 0; iinfo < infos.Length; ++iinfo)
            {
                // Only columns whose map is still null need training.
                if (termMap[iinfo] != null)
                {
                    continue;
                }
                var bldr = Builder.Create(infos[iinfo].TypeSrc, column[iinfo].Sort ?? args.Sort);
                trainerInfo[itrainer] = iinfo;
                trainer[itrainer++] = Trainer.Create(cursor, infos[iinfo].Source, false, lims[iinfo], bldr);
            }
            ch.Assert(itrainer == trainer.Length);
            pch.SetHeader(header,
                e =>
                {
                    e.SetProgress(0, rowCur, rowCount);
                    // Purely feedback for the user. That the other thread might be
                    // working in the background is not a problem.
                    e.SetMetric(0, trainer.Sum(t => t.Count));
                });

            // The [0,tmin) trainers are finished.
            int tmin = 0;
            // We might exit early if all trainers reach their maximum.
            while (tmin < trainer.Length && cursor.MoveNext())
            {
                rowCur++;
                for (int t = tmin; t < trainer.Length; ++t)
                {
                    // A trainer that declines more rows is swapped into the finished prefix.
                    if (!trainer[t].ProcessRow())
                    {
                        Utils.Swap(ref trainerInfo[t], ref trainerInfo[tmin]);
                        Utils.Swap(ref trainer[t], ref trainer[tmin++]);
                    }
                }
            }
            pch.Checkpoint(trainer.Sum(t => t.Count), rowCur);
        }
        for (itrainer = 0; itrainer < trainer.Length; ++itrainer)
        {
            int iinfo = trainerInfo[itrainer];
            ch.Assert(termMap[iinfo] == null);
            if (trainer[itrainer].Count == 0)
            {
                ch.Warning("Term map for output column '{0}' contains no entries.", infos[iinfo].Name);
            }
            termMap[iinfo] = trainer[itrainer].Finish();
            // Allow the intermediate structures in the trainer and builder to be released as we iterate
            // over the columns, as the Finish operation can potentially result in the allocation of
            // additional structures.
            trainer[itrainer] = null;
        }
        ch.Assert(termMap.All(tm => tm != null));
        ch.Assert(termMap.Zip(infos, (tm, info) => tm.ItemType.Equals(info.TypeSrc.ItemType)).All(x => x));
    }
    return (termMap);
}
/// <summary>
/// Adds a concrete argument object to the simulator.
/// </summary>
/// <param name="arguments">The arguments object; this method
/// can be called several times depending on the needs of the simulator.</param>
/// <returns>The calling instance, to allow call chaining.</returns>
public abstract ISimulatable With(ArgumentsBase arguments);