/// <summary>
/// Core training routine that concrete trainers must implement.
/// </summary>
/// <param name="ch">Channel handed to the implementation.</param>
/// <param name="data">Role-mapped data to train on.</param>
/// <param name="predictor">Linear model parameters handed to the implementation.
/// NOTE(review): presumably an optional warm-start model; confirm whether null is permitted.</param>
/// <param name="weightSetCount">NOTE(review): presumably the number of weight sets produced by the label check; confirm against callers.</param>
/// <returns>The model produced by training.</returns>
private protected abstract TModel TrainCore(IChannel ch, RoleMappedData data, LinearModelParameters predictor, int weightSetCount);
/// <summary>
/// Validates that the label of <paramref name="data"/> is suitable for binary classification.
/// </summary>
/// <param name="data">Role-mapped data whose label is checked; must not be null.</param>
protected override void CheckLabel(RoleMappedData data)
{
    Contracts.AssertValue(data);

    // The actual validation is delegated to the role-mapped data helper.
    data.CheckBinaryLabel();
}
/// <summary>
/// Trains a field-aware factorization machine on <paramref name="data"/>. Every feature column of the
/// training data is treated as one field. When <paramref name="validData"/> is supplied and verbose mode
/// is on, the average validation loss is reported next to the training loss after each iteration.
/// </summary>
/// <param name="ch">Channel used for checks, warnings and exceptions.</param>
/// <param name="pch">Progress channel that receives the header and the per-iteration checkpoints.</param>
/// <param name="data">Training data with a binary label; each feature column must be a known-size vector of R4.</param>
/// <param name="validData">Optional validation data; its feature columns are asserted to match the training ones.</param>
/// <param name="predictor">Optional initial model; its feature count and latent dimension must match the trainer's.</param>
/// <returns>The trained model parameters.</returns>
private FieldAwareFactorizationMachineModelParameters TrainCore(IChannel ch, IProgressChannel pch, RoleMappedData data,
    RoleMappedData validData = null, FieldAwareFactorizationMachineModelParameters predictor = null)
{
    Host.AssertValue(ch);
    Host.AssertValue(pch);

    data.CheckBinaryLabel();
    var featureColumns = data.Schema.GetColumns(RoleMappedSchema.ColumnRole.Feature);
    int fieldCount = featureColumns.Count;
    int totalFeatureCount = 0;
    int[] fieldColumnIndexes = new int[fieldCount];
    for (int f = 0; f < fieldCount; f++)
    {
        var col = featureColumns[f];
        Host.Assert(!col.IsHidden);
        if (!(col.Type is VectorType vectorType) || !vectorType.IsKnownSize || vectorType.ItemType != NumberType.Float)
        {
            throw ch.ExceptParam(nameof(data), "Training feature column '{0}' must be a known-size vector of R4, but has type: {1}.", col.Name, col.Type);
        }
        Host.Assert(vectorType.Size > 0);
        fieldColumnIndexes[f] = col.Index;
        totalFeatureCount += vectorType.Size;
    }
    ch.Check(checked(totalFeatureCount * fieldCount * _latentDimAligned) <= Utils.ArrayMaxSize, "Latent dimension or the number of fields too large");

    if (predictor != null)
    {
        ch.Check(predictor.FeatureCount == totalFeatureCount, "Input model's feature count mismatches training feature count");
        ch.Check(predictor.LatentDim == _latentDim, "Input model's latent dimension mismatches trainer's");
    }

    if (validData != null)
    {
        validData.CheckBinaryLabel();
        // Read the feature columns from the validation schema. Reading them from the training schema
        // would make the compatibility asserts below compare the training data with itself.
        var validFeatureColumns = validData.Schema.GetColumns(RoleMappedSchema.ColumnRole.Feature);
        Host.Assert(fieldCount == validFeatureColumns.Count);
        for (int f = 0; f < fieldCount; f++)
        {
            var featCol = featureColumns[f];
            var validFeatCol = validFeatureColumns[f];
            Host.Assert(featCol.Name == validFeatCol.Name);
            Host.Assert(featCol.Type == validFeatCol.Type);
        }
    }

    bool shuffle = _shuffle;
    if (shuffle && !data.Data.CanShuffle)
    {
        ch.Warning("Training data does not support shuffling, so ignoring request to shuffle");
        shuffle = false;
    }
    var rng = shuffle ? Host.Rand : null;

    // Scratch buffers reused for every example.
    var featureGetters = new ValueGetter<VBuffer<float>>[fieldCount];
    var featureBuffer = new VBuffer<float>();
    var featureValueBuffer = new float[totalFeatureCount];
    var featureIndexBuffer = new int[totalFeatureCount];
    var featureFieldBuffer = new int[totalFeatureCount];
    var latentSum = new AlignedArray(fieldCount * fieldCount * _latentDimAligned, 16);

    var metricNames = new List<string>() { "Training-loss" };
    if (validData != null)
    {
        metricNames.Add("Validation-loss");
    }

    int iter = 0;
    long exampleCount = 0;
    long badExampleCount = 0;
    long validBadExampleCount = 0;
    double loss = 0;
    double validLoss = 0;

    pch.SetHeader(new ProgressHeader(metricNames.ToArray(), new string[] { "iterations", "examples" }), entry =>
    {
        entry.SetProgress(0, iter, _numIterations);
        entry.SetProgress(1, exampleCount);
    });

    // Only the feature, label and (if present) weight columns need to be active in the cursor.
    Func<int, bool> pred = c => fieldColumnIndexes.Contains(c) || c == data.Schema.Label.Value.Index || c == data.Schema.Weight?.Index;

    InitializeTrainingState(fieldCount, totalFeatureCount, predictor, out float[] linearWeights,
        out AlignedArray latentWeightsAligned, out float[] linearAccSqGrads, out AlignedArray latentAccSqGradsAligned);

    // refer to Algorithm 3 in https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf
    while (iter++ < _numIterations)
    {
        using (var cursor = data.Data.GetRowCursor(pred, rng))
        {
            var labelGetter = RowCursorUtils.GetLabelGetter(cursor, data.Schema.Label.Value.Index);
            var weightGetter = data.Schema.Weight?.Index is int weightIdx ? RowCursorUtils.GetGetterAs<float>(NumberType.R4, cursor, weightIdx) : null;
            for (int i = 0; i < fieldCount; i++)
            {
                featureGetters[i] = cursor.GetGetter<VBuffer<float>>(fieldColumnIndexes[i]);
            }

            // Per-iteration statistics are restarted on every pass over the data.
            loss = 0;
            exampleCount = 0;
            badExampleCount = 0;
            while (cursor.MoveNext())
            {
                float label = 0;
                float weight = 1;
                int count = 0;
                float modelResponse = 0;
                labelGetter(ref label);
                weightGetter?.Invoke(ref weight);

                // x - x is 0 for finite x and NaN for NaN or infinity, so a non-finite result
                // means the label or the weight is unusable.
                float annihilation = label - label + weight - weight;
                if (!FloatUtils.IsFinite(annihilation))
                {
                    badExampleCount++;
                    continue;
                }
                if (!FieldAwareFactorizationMachineUtils.LoadOneExampleIntoBuffer(featureGetters, featureBuffer, _norm, ref count,
                    featureFieldBuffer, featureIndexBuffer, featureValueBuffer))
                {
                    badExampleCount++;
                    continue;
                }

                // refer to Algorithm 1 in [3] https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf
                FieldAwareFactorizationMachineInterface.CalculateIntermediateVariables(fieldCount, _latentDimAligned, count,
                    featureFieldBuffer, featureIndexBuffer, featureValueBuffer, linearWeights, latentWeightsAligned, latentSum, ref modelResponse);
                var slope = CalculateLossSlope(label, modelResponse);

                // refer to Algorithm 2 in [3] https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf
                FieldAwareFactorizationMachineInterface.CalculateGradientAndUpdate(_lambdaLinear, _lambdaLatent, _learningRate, fieldCount, _latentDimAligned, weight, count,
                    featureFieldBuffer, featureIndexBuffer, featureValueBuffer, latentSum, slope, linearWeights, latentWeightsAligned, linearAccSqGrads, latentAccSqGradsAligned);
                loss += weight * CalculateLoss(label, modelResponse);
                exampleCount++;
            }
            loss /= exampleCount;
        }

        if (_verbose)
        {
            if (validData == null)
            {
                pch.Checkpoint(loss, iter, exampleCount);
            }
            else
            {
                validLoss = CalculateAvgLoss(ch, validData, _norm, linearWeights, latentWeightsAligned, _latentDimAligned, latentSum,
                    featureFieldBuffer, featureIndexBuffer, featureValueBuffer, featureBuffer, ref validBadExampleCount);
                pch.Checkpoint(loss, validLoss, iter, exampleCount);
            }
        }
    }

    if (badExampleCount != 0)
    {
        ch.Warning($"Skipped {badExampleCount} examples with bad label/weight/features in training set");
    }
    if (validBadExampleCount != 0)
    {
        ch.Warning($"Skipped {validBadExampleCount} examples with bad label/weight/features in validation set");
    }

    return new FieldAwareFactorizationMachineModelParameters(Host, _norm, fieldCount, totalFeatureCount, _latentDim, linearWeights, latentWeightsAligned);
}
/// <summary>
/// Builds a view in which the label column is replaced by a float column: 1 where
/// <paramref name="equalsTarget"/> holds and 0 otherwise. Unless missing labels are imputed as negative,
/// values recognised as missing for <paramref name="type"/> are mapped to NaN instead of 0.
/// </summary>
/// <typeparam name="T">Raw type of the label column.</typeparam>
/// <param name="type">Column type of the label; its raw type must be <typeparamref name="T"/>.</param>
/// <param name="equalsTarget">Predicate telling whether a label value is the target class.</param>
/// <param name="data">Role-mapped data that must have a label column.</param>
/// <returns>The data view with the remapped label column.</returns>
private protected IDataView MapLabelsCore<T>(ColumnType type, InPredicate<T> equalsTarget, RoleMappedData data)
{
    Host.AssertValue(type);
    Host.Assert(type.RawType == typeof(T));
    Host.AssertValue(equalsTarget);
    Host.AssertValue(data);
    Host.Assert(data.Schema.Label.HasValue);

    var labelColumn = data.Schema.Label.Value;

    // Plain 1/0 mapping when missing values are imputed as negative, or when the type has no
    // missing-value predicate at all.
    if (Args.ImputeMissingLabelsAsNegative || !Conversions.Instance.TryGetIsNAPredicate(type, out InPredicate<T> isMissing))
    {
        return LambdaColumnMapper.Create(Host, "Label mapper", data.Data, labelColumn.Name, labelColumn.Name, type, NumberType.Float,
            (in T src, ref float dst) => dst = equalsTarget(in src) ? 1 : default(float));
    }

    // Otherwise keep missing labels visible as NaN.
    return LambdaColumnMapper.Create(Host, "Label mapper", data.Data, labelColumn.Name, labelColumn.Name, type, NumberType.Float,
        (in T src, ref float dst) =>
        {
            if (equalsTarget(in src))
            {
                dst = 1;
            }
            else if (isMissing(in src))
            {
                dst = float.NaN;
            }
            else
            {
                dst = default(float);
            }
        });
}
/// <summary>
/// Prepares the data pipeline of every input model on an empty view. The first model provides
/// <paramref name="startingData"/> and <paramref name="transformedData"/>; when pipeline validation is
/// requested, each later model's pipeline is checked against the first one.
/// </summary>
/// <param name="env">Host environment.</param>
/// <param name="input">Input holding the models; must contain at least one model.</param>
/// <param name="startingData">Receives the empty input view built for the first model.</param>
/// <param name="transformedData">Receives the role-mapped data produced by the first model.</param>
private static void GetPipeline(IHostEnvironment env, InputBase input, out IDataView startingData, out RoleMappedData transformedData)
{
    Contracts.AssertValue(env);
    env.AssertValue(input);
    env.AssertNonEmpty(input.Models);

    startingData = null;
    transformedData = null;

    ISchema referenceSchema = null;
    // Serialized form of the first pipeline, produced lazily the first time a comparison is needed.
    byte[][] referenceBytes = null;
    string[] referenceEntryNames = null;

    for (int modelIndex = 0; modelIndex < input.Models.Length; modelIndex++)
    {
        var model = input.Models[modelIndex];
        var emptyInput = new EmptyDataView(env, model.TransformModel.InputSchema);
        model.PrepareData(env, emptyInput, out RoleMappedData currentTransformed, out IPredictor pred);

        if (referenceSchema == null)
        {
            // First model: it defines the reference pipeline.
            env.Assert(modelIndex == 0);
            referenceSchema = model.TransformModel.InputSchema;
            startingData = emptyInput;
            transformedData = currentTransformed;
            continue;
        }

        if (!input.ValidatePipelines)
        {
            continue;
        }

        using (var ch = env.Start("Validating pipeline"))
        {
            if (referenceBytes == null)
            {
                ch.Assert(referenceEntryNames == null);
                SerializeRoleMappedData(env, ch, transformedData, out referenceBytes, out referenceEntryNames);
            }
            CheckSamePipeline(env, ch, currentTransformed, referenceBytes, referenceEntryNames);
            ch.Done();
        }
    }
}
/// <summary>
/// Trains a linear model with the native SymSGD learner. Data is loaded through an
/// <c>InputDataManager</c>; if everything fits in memory a single native call runs all iterations,
/// otherwise training proceeds batch by batch, reloading data between native calls.
/// </summary>
/// <param name="ch">Channel used for argument checks and informational messages.</param>
/// <param name="data">Training data providing label and features.</param>
/// <param name="predictor">Optional initial linear model whose weights and bias seed the training; may be null.</param>
/// <param name="weightSetCount">Not used by this implementation.</param>
/// <returns>The predictor created from the learned weights and bias.</returns>
private TPredictor TrainCore(IChannel ch, RoleMappedData data, LinearModelParameters predictor, int weightSetCount)
{
    int numFeatures = data.Schema.Feature.Value.Type.GetVectorSize();
    var cursorFactory = new FloatLabelCursor.Factory(data, CursOpt.Label | CursOpt.Features);
    int numThreads = 1;
    ch.CheckUserArg(numThreads > 0, nameof(_options.NumberOfThreads),
        "The number of threads must be either null or a positive integer.");

    VBuffer<float> weights = default;
    float bias = 0.0f;
    if (predictor != null)
    {
        // Continue from the supplied model: dense weights plus its bias.
        predictor.GetFeatureWeights(ref weights);
        VBufferUtils.Densify(ref weights);
        bias = predictor.Bias;
    }
    else
    {
        weights = VBufferUtils.CreateDense<float>(numFeatures);
    }

    var weightsEditor = VBufferEditor.CreateFromBuffer(ref weights);

    // Reference: Parasail. SymSGD.
    // A null learning rate or update frequency asks the native learner to tune the value itself.
    bool tuneLR = _options.LearningRate == null;
    var lr = _options.LearningRate ?? 1.0f;

    bool tuneNumLocIter = (_options.UpdateFrequency == null);
    var numLocIter = _options.UpdateFrequency ?? 1;

    var l2Const = _options.L2Regularization;
    var piw = _options.PositiveInstanceWeight;

    // This is state of the learner that is shared with the native code.
    State state = new State();
    GCHandle stateGCHandle = default;
    try
    {
        stateGCHandle = GCHandle.Alloc(state, GCHandleType.Pinned);

        state.TotalInstancesProcessed = 0;
        using (InputDataManager inputDataManager = new InputDataManager(this, cursorFactory, ch))
        {
            bool shouldInitialize = true;
            using (var pch = Host.StartProgressChannel("Preprocessing"))
            {
                inputDataManager.LoadAsMuchAsPossible();
            }

            int iter = 0;
            if (inputDataManager.IsFullyLoaded)
            {
                ch.Info("Data fully loaded into memory.");
            }
            using (var pch = Host.StartProgressChannel("Training"))
            {
                if (inputDataManager.IsFullyLoaded)
                {
                    pch.SetHeader(new ProgressHeader(new[] { "iterations" }),
                        entry => entry.SetProgress(0, state.PassIteration, _options.NumberOfIterations));
                    // If fully loaded, call the SymSGDNative and do not come back until learned for all iterations.
                    Native.LearnAll(inputDataManager, tuneLR, ref lr, l2Const, piw, weightsEditor.Values, ref bias, numFeatures,
                        _options.NumberOfIterations, numThreads, tuneNumLocIter, ref numLocIter, _options.Tolerance, _options.Shuffle, shouldInitialize,
                        stateGCHandle, ch.Info);
                    shouldInitialize = false;
                }
                else
                {
                    pch.SetHeader(new ProgressHeader(new[] { "iterations" }),
                        entry => entry.SetProgress(0, iter, _options.NumberOfIterations));

                    // Since we loaded data in batch sizes, multiple passes over the loaded data is feasible.
                    int numPassesForABatch = inputDataManager.Count / 10000;
                    while (iter < _options.NumberOfIterations)
                    {
                        // We want to train on the final passes thoroughly (without learning on the same batch multiple times)
                        // This is for fine tuning the AUC. Experimentally, we found that 1 or 2 passes is enough
                        int numFinalPassesToTrainThoroughly = 2;
                        // We also do not want to learn for more passes than what the user asked
                        int numPassesForThisBatch = Math.Min(numPassesForABatch, _options.NumberOfIterations - iter - numFinalPassesToTrainThoroughly);
                        // If all of this leaves us with 0 passes, then set numPassesForThisBatch to 1
                        numPassesForThisBatch = Math.Max(1, numPassesForThisBatch);
                        state.PassIteration = iter;
                        Native.LearnAll(inputDataManager, tuneLR, ref lr, l2Const, piw, weightsEditor.Values, ref bias, numFeatures,
                            numPassesForThisBatch, numThreads, tuneNumLocIter, ref numLocIter, _options.Tolerance, _options.Shuffle, shouldInitialize,
                            stateGCHandle, ch.Info);
                        shouldInitialize = false;

                        // Check if we are done with going through the data
                        if (inputDataManager.FinishedTheLoad)
                        {
                            iter += numPassesForThisBatch;
                            // Check if more passes are left
                            if (iter < _options.NumberOfIterations)
                            {
                                inputDataManager.RestartLoading(_options.Shuffle, Host);
                            }
                        }

                        // If more passes are left, load as much as possible
                        if (iter < _options.NumberOfIterations)
                        {
                            inputDataManager.LoadAsMuchAsPossible();
                        }
                    }
                }

                // Maps back the dense features that are mislocated
                if (numThreads > 1)
                {
                    Native.MapBackWeightVector(weightsEditor.Values, stateGCHandle);
                }
                Native.DeallocateSequentially(stateGCHandle);
            }
        }
    }
    finally
    {
        // Always release the pinned handle, even when training throws.
        if (stateGCHandle.IsAllocated)
        {
            stateGCHandle.Free();
        }
    }
    return CreatePredictor(weights, bias);
}
/// <summary>
/// Validates the label of <paramref name="examples"/> as a multiclass label.
/// </summary>
/// <param name="examples">Role-mapped data whose label is checked.</param>
/// <param name="weightSetCount">Receives the value produced by the multiclass label check.</param>
private protected override void CheckLabel(RoleMappedData examples, out int weightSetCount)
    => examples.CheckMulticlassLabel(out weightSetCount);
/// <summary>
/// Sets the ranking-specific options before training: the "lambdarank" objective and a single
/// evaluation position, after verifying that group information was supplied.
/// </summary>
/// <param name="ch">Channel used for the argument check.</param>
/// <param name="data">Training data (not inspected here).</param>
/// <param name="labels">Labels (not inspected here).</param>
/// <param name="groups">Group information; must not be null.</param>
private protected override void CheckAndUpdateParametersBeforeTraining(IChannel ch, RoleMappedData data, float[] labels, int[] groups)
{
    Host.AssertValue(ch);

    GbmOptions["objective"] = "lambdarank";
    ch.CheckValue(groups, nameof(groups));

    // Only output one ndcg score.
    GbmOptions["eval_at"] = "5";
}
/// <summary>
/// Constructor, given a training set and optional other arguments.
/// </summary>
/// <param name="trainingSet">Will set <see cref="TrainingSet"/> to this value. This must be specified.</param>
/// <param name="validationSet">Will set <see cref="ValidationSet"/> to this value if specified.</param>
/// <param name="testSet">Will set <see cref="TestSet"/> to this value if specified.</param>
/// <param name="initialPredictor">Will set <see cref="InitialPredictor"/> to this value if specified.</param>
public TrainContext(RoleMappedData trainingSet, RoleMappedData validationSet = null, RoleMappedData testSet = null, IPredictor initialPredictor = null)
{
    Contracts.CheckValue(trainingSet, nameof(trainingSet));
    Contracts.CheckValueOrNull(validationSet);
    // Keep the contract uniform: every optional argument gets the same null-tolerant check.
    Contracts.CheckValueOrNull(testSet);
    Contracts.CheckValueOrNull(initialPredictor);

    // REVIEW: Should there be code here to ensure that the role mappings between the two are compatible?
    // That is, all the role mappings are the same and the columns between them have identical types?

    TrainingSet = trainingSet;
    ValidationSet = validationSet;
    TestSet = testSet;
    InitialPredictor = initialPredictor;
}
/// <summary>
/// Scores a sample with the ONNX estimator, saves the resulting pipeline with the model-saving
/// utilities, reloads it, and checks that the reloaded pipeline yields the expected softmax output.
/// </summary>
[ConditionalFact(typeof(Environment), nameof(Environment.Is64BitProcess))] // x86 fails with "An attempt was made to load a program with an incorrect format."
void TestOldSavingAndLoading()
{
    // The body only runs on Windows; elsewhere the test returns without asserting anything.
    if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
    {
        return;
    }

    var modelFile = "squeezenet/00000001/model.onnx";
    var samplevector = GetSampleArrayData();

    var dataView = ComponentCreation.CreateDataView(Env,
        new TestData[] { new TestData() { data_0 = samplevector } });

    var inputNames = new[] { "data_0" };
    var outputNames = new[] { "softmaxout_1" };
    var est = new OnnxScoringEstimator(Env, modelFile, inputNames, outputNames);
    var transformer = est.Fit(dataView);
    var result = transformer.Transform(dataView);
    var resultRoles = new RoleMappedData(result);
    using (var ms = new MemoryStream())
    {
        TrainUtils.SaveModel(Env, Env.Start("saving"), ms, null, resultRoles);
        ms.Position = 0;
        var loadedView = ModelFileUtils.LoadTransforms(Env, dataView, ms);

        // Fail loudly if the output column is missing; otherwise the index would silently default to 0
        // and the assertions below would run against the wrong column.
        Assert.True(loadedView.Schema.TryGetColumnIndex(outputNames[0], out int softMaxOut1),
            "Reloaded pipeline does not expose the expected output column.");

        using (var cursor = loadedView.GetRowCursor(col => col == softMaxOut1))
        {
            VBuffer<float> softMaxValue = default;
            var softMaxGetter = cursor.GetGetter<VBuffer<float>>(softMaxOut1);
            float sum = 0f;
            int i = 0;
            while (cursor.MoveNext())
            {
                softMaxGetter(ref softMaxValue);
                var values = softMaxValue.DenseValues();
                foreach (var val in values)
                {
                    sum += val;
                    // Spot-check a few known probabilities of the reference output.
                    if (i == 0)
                    {
                        Assert.InRange(val, 0.00004, 0.00005);
                    }
                    if (i == 1)
                    {
                        Assert.InRange(val, 0.003844, 0.003845);
                    }
                    if (i == 999)
                    {
                        Assert.InRange(val, 0.0029566, 0.0029567);
                    }
                    i++;
                }
            }
            // Softmax outputs must sum to (approximately) one.
            Assert.InRange(sum, 1.0, 1.00001);
        }
    }
}
/// <summary>
/// Trains the PCA predictor: projects the data onto a random Gaussian matrix, orthonormalizes the
/// result, projects the data again onto that basis, and eigen-decomposes the resulting small matrix.
/// </summary>
/// <param name="ch">Channel used for messages and exceptions.</param>
/// <param name="data">Training data providing features and optional weights.</param>
/// <param name="dimension">Dimension of the feature vectors; the configured rank must not exceed it.</param>
/// <returns>The trained PCA predictor.</returns>
private PcaPredictor TrainCore(IChannel ch, RoleMappedData data, int dimension)
{
    Host.AssertValue(ch);
    ch.AssertValue(data);

    if (_rank > dimension)
    {
        throw ch.Except("Rank ({0}) cannot be larger than the original dimension ({1})", _rank, dimension);
    }
    int oversampledRank = Math.Min(_rank + _oversampling, dimension);

    //exact: (size of the 2 big matrices + other minor allocations) / (2^30)
    Double estimatedGigabytes = 2.0 * dimension * oversampledRank * sizeof(Float) / 1e9;
    if (estimatedGigabytes > 2)
    {
        ch.Info("Estimate memory usage: {0:G2} GB. If running out of memory, reduce rank and oversampling factor.",
            estimatedGigabytes);
    }

    var y = Zeros(oversampledRank, dimension);
    var mean = _center ? VBufferUtils.CreateDense<Float>(dimension) : VBufferUtils.CreateEmpty<Float>(dimension);

    var omega = GaussianMatrix(oversampledRank, dimension, _seed);

    var cursorFactory = new FeatureFloatVectorCursor.Factory(data, CursOpt.Features | CursOpt.Weight);
    Project(Host, cursorFactory, ref mean, omega, y, out long numBad);
    if (numBad > 0)
    {
        ch.Warning("Skipped {0} instances with missing features/weights during training", numBad);
    }

    //Orthonormalize Y in-place using stabilized Gram Schmidt algorithm.
    //Ref: https://en.wikipedia.org/wiki/Gram-Schmidt#Algorithm
    for (var row = 0; row < oversampledRank; ++row)
    {
        var unit = y[row];
        VectorUtils.ScaleBy(ref unit, 1 / VectorUtils.Norm(y[row]));

        // Make the next vectors in the queue orthogonal to the orthonormalized vectors.
        for (var later = row + 1; later < oversampledRank; ++later)
        {
            //subtract the projection of y[later] on unit.
            VectorUtils.AddMult(ref unit, -VectorUtils.DotProduct(ref unit, ref y[later]), ref y[later]);
        }
    }
    var q = y; // q in QR decomposition.

    var b = omega; // reuse the memory allocated by Omega.
    Project(Host, cursorFactory, ref mean, q, b, out numBad);

    //Compute B2 = B' * B
    var b2 = new Float[oversampledRank * oversampledRank];
    for (var i = 0; i < oversampledRank; ++i)
    {
        for (var j = i; j < oversampledRank; ++j)
        {
            // B2 is symmetric, so each dot product fills both mirrored cells.
            var dot = VectorUtils.DotProduct(ref b[i], ref b[j]);
            b2[j * oversampledRank + i] = dot;
            b2[i * oversampledRank + j] = dot;
        }
    }

    // eigenvectors and eigenvalues of the small matrix B2.
    EigenUtils.EigenDecomposition(b2, out Float[] smallEigenvalues, out Float[] smallEigenvectors);
    PostProcess(b, smallEigenvalues, smallEigenvectors, dimension, oversampledRank);

    return new PcaPredictor(Host, _rank, b, ref mean);
}
/// <summary>
/// Validates that the label of <paramref name="examples"/> is suitable for regression.
/// </summary>
/// <param name="examples">Role-mapped data whose label is checked.</param>
protected override void CheckLabel(RoleMappedData examples)
    => examples.CheckRegressionLabel();
/// <summary>
/// Validates that the label of <paramref name="data"/> is suitable for regression.
/// </summary>
/// <param name="data">Role-mapped data whose label is checked; must not be null.</param>
private protected override void CheckLabel(RoleMappedData data)
{
    Contracts.AssertValue(data);

    // The actual validation is delegated to the role-mapped data helper.
    data.CheckRegressionLabel();
}
/// <summary>
/// Validates the label of <paramref name="examples"/> as a regression label and reports a single weight set.
/// </summary>
/// <param name="examples">Role-mapped data whose label is checked.</param>
/// <param name="weightSetCount">Always set to 1 once the label check has passed.</param>
protected override void CheckLabel(RoleMappedData examples, out int weightSetCount)
{
    examples.CheckRegressionLabel();

    // This overload always reports one weight set.
    weightSetCount = 1;
}
/// <summary>
/// Trains the ensemble: for each batch produced by the subset selector, trains one model per trainer
/// (optionally in parallel), prunes the batch's models, optionally trains the stacking combiner, and
/// finally builds the predictor from all surviving models.
/// </summary>
/// <param name="ch">Channel used for messages, warnings and checks.</param>
/// <param name="data">Training data handed to the subset selector.</param>
/// <returns>The ensemble predictor created from the trained models.</returns>
private TPredictor TrainCore(IChannel ch, RoleMappedData data)
{
    Host.AssertValue(ch);
    ch.AssertValue(data);

    // 1. Subset Selection
    var stackingTrainer = Combiner as IStackingTrainer<TOutput>;

    //REVIEW: Implement stacking for Batch mode.
    ch.CheckUserArg(stackingTrainer == null || Args.BatchSize <= 0, nameof(Args.BatchSize), "Stacking works only with Non-batch mode");

    // The stacking trainer may require a larger validation share than the sub-model selector.
    var validationDataSetProportion = SubModelSelector.ValidationDatasetProportion;
    if (stackingTrainer != null)
    {
        validationDataSetProportion = Math.Max(validationDataSetProportion, stackingTrainer.ValidationDatasetProportion);
    }

    var needMetrics = Args.ShowMetrics || Combiner is IWeightedAverager;
    var models = new List<FeatureSubsetModel<TOutput>>();

    _subsetSelector.Initialize(data, NumModels, Args.BatchSize, validationDataSetProportion);
    int batchNumber = 1;
    foreach (var batch in _subsetSelector.GetBatches(Host.Rand))
    {
        // 2. Core train
        ch.Info("Training {0} learners for the batch {1}", Trainers.Length, batchNumber);
        batchNumber++;

        var batchModels = new FeatureSubsetModel<TOutput>[Trainers.Length];
        var subsets = _subsetSelector.GetSubsets(batch, Host.Rand);
        var parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = Args.TrainParallel ? -1 : 1 };

        Parallel.ForEach(subsets, parallelOptions, (subset, state, index) =>
        {
            ch.Info("Beginning training model {0} of {1}", index + 1, Trainers.Length);
            Stopwatch sw = Stopwatch.StartNew();
            try
            {
                if (EnsureMinimumFeaturesSelected(subset))
                {
                    var trained = Trainers[(int)index].Train(subset.Data);
                    var model = new FeatureSubsetModel<TOutput>(trained, subset.SelectedFeatures, null);
                    SubModelSelector.CalculateMetrics(model, _subsetSelector, subset, batch, needMetrics);
                    batchModels[(int)index] = model;
                }
            }
            catch (Exception ex)
            {
                // A failed trainer is reported and skipped rather than aborting the whole ensemble.
                ch.Assert(batchModels[(int)index] == null);
                ch.Warning(ex.Sensitivity(), "Trainer {0} of {1} was not learned properly due to the exception '{2}' and will not be added to models.",
                    index + 1, Trainers.Length, ex.Message);
            }
            ch.Info("Trainer {0} of {1} finished in {2}", index + 1, Trainers.Length, sw.Elapsed);
        });

        // Slots of trainers that failed or were skipped remain null.
        var modelsList = batchModels.Where(m => m != null).ToList();
        if (Args.ShowMetrics)
        {
            PrintMetrics(ch, modelsList);
        }

        modelsList = SubModelSelector.Prune(modelsList).ToList();

        stackingTrainer?.Train(modelsList, _subsetSelector.GetTestData(null, batch), Host);

        models.AddRange(modelsList);
        int modelSize = Utils.Size(models);
        int trainerCount = Utils.Size(Trainers);
        if (modelSize < trainerCount)
        {
            ch.Warning("{0} of {1} trainings failed.", trainerCount - modelSize, trainerCount);
        }
        ch.Check(modelSize > 0, "Ensemble training resulted in no valid models.");
    }
    return CreatePredictor(models);
}
/// <summary>
/// Validates that the label of <paramref name="data"/> is suitable for regression.
/// </summary>
/// <param name="data">Role-mapped data whose label is checked.</param>
private protected override void CheckLabels(RoleMappedData data)
    => data.CheckRegressionLabel();
/// <summary>
/// Determines the number of classes on the first call, then replaces every NaN label in
/// <paramref name="labels"/> with the last class index (<c>_numClass - 1</c>).
/// </summary>
/// <param name="ch">Channel used for checks and exceptions.</param>
/// <param name="data">Training data; its label type decides how the class count is derived.</param>
/// <param name="labels">Label values, modified in place.</param>
protected override void ConvertNaNLabels(IChannel ch, RoleMappedData data, float[] labels)
{
    // Only initialize one time.
    if (_numClass < 0)
    {
        float smallest = float.MaxValue;
        float largest = float.MinValue;
        bool sawNaN = false;
        foreach (var value in labels)
        {
            if (float.IsNaN(value))
            {
                sawNaN = true;
                continue;
            }
            smallest = Math.Min(smallest, value);
            largest = Math.Max(largest, value);
        }

        ch.CheckParam(smallest >= 0, nameof(data), "min label cannot be negative");
        if (largest >= _maxNumClass)
        {
            throw ch.ExceptParam(nameof(data), $"max label cannot exceed {_maxNumClass}");
        }

        // NaN labels get one extra class of their own.
        int extraClassForNaN = sawNaN ? 1 : 0;
        if (data.Schema.Label.Type.IsKey)
        {
            ch.Check(data.Schema.Label.Type.AsKey.Contiguous, "label value should be contiguous");
            int keyCount = data.Schema.Label.Type.AsKey.Count;
            _numClass = keyCount + extraClassForNaN;
            _tlcNumClass = keyCount;
        }
        else
        {
            int observedClasses = (int)largest + 1;
            _numClass = observedClasses + extraClassForNaN;
            _tlcNumClass = observedClasses;
        }
    }

    float defaultLabel = _numClass - 1;
    for (int i = 0; i < labels.Length; ++i)
    {
        if (float.IsNaN(labels[i]))
        {
            labels[i] = defaultLabel;
        }
    }
}
/// <summary>
/// Creates a subset from the given data and an optional feature selection.
/// </summary>
/// <param name="data">Will set <c>Data</c> to this value; must not be null.</param>
/// <param name="features">Will set <c>SelectedFeatures</c> to this value; may be null.</param>
public Subset(RoleMappedData data, BitArray features = null)
{
    Contracts.AssertValue(data);

    SelectedFeatures = features;
    Data = data;
}
/// <summary>
/// Validates the label of <paramref name="examples"/> as a binary label and reports a single weight set.
/// </summary>
/// <param name="examples">Role-mapped data whose label is checked.</param>
/// <param name="weightSetCount">Always set to 1 once the label check has passed.</param>
private void CheckLabel(RoleMappedData examples, out int weightSetCount)
{
    examples.CheckBinaryLabel();

    // This overload always reports one weight set.
    weightSetCount = 1;
}
/// <summary>
/// Entry-point helper that binds column roles on the training data, optionally normalizes and caches
/// it, trains the trainer produced by <paramref name="createTrainer"/>, and wraps the resulting
/// predictor in a new <typeparamref name="TOut"/>.
/// </summary>
/// <typeparam name="TArg">Type of the learner input.</typeparam>
/// <typeparam name="TOut">Type of the trainer output to create.</typeparam>
/// <param name="host">Host used for channels, caching and training.</param>
/// <param name="input">Learner input supplying the training data, feature column, normalization and caching options.</param>
/// <param name="createTrainer">Factory for the trainer.</param>
/// <param name="getLabel">Optional provider of the label column name.</param>
/// <param name="getWeight">Optional provider of the weight column name.</param>
/// <param name="getGroup">Optional provider of the group column name.</param>
/// <param name="getName">Optional provider of the name column name.</param>
/// <param name="getCustom">Optional provider of custom role/column pairs.</param>
/// <param name="calibrator">Optional calibrator trainer factory passed to the training utility.</param>
/// <param name="maxCalibrationExamples">Maximum number of calibration examples passed to the training utility.</param>
/// <returns>A new <typeparamref name="TOut"/> whose predictor model holds the trained predictor.</returns>
public static TOut Train<TArg, TOut>(IHost host, TArg input,
    Func<ITrainer> createTrainer,
    Func<string> getLabel = null, Func<string> getWeight = null, Func<string> getGroup = null, Func<string> getName = null,
    Func<IEnumerable<KeyValuePair<RoleMappedSchema.ColumnRole, string>>> getCustom = null,
    ICalibratorTrainerFactory calibrator = null, int maxCalibrationExamples = 0)
    where TArg : LearnerInputBase
    where TOut : CommonOutputs.TrainerOutput, new()
{
    using (var ch = host.Start("Training"))
    {
        var schema = input.TrainingData.Schema;
        var feature = FindColumn(ch, schema, input.FeatureColumn);
        var label = getLabel?.Invoke();
        var weight = getWeight?.Invoke();
        var group = getGroup?.Invoke();
        var name = getName?.Invoke();
        var custom = getCustom?.Invoke();

        var trainer = createTrainer();

        IDataView view = input.TrainingData;
        TrainUtils.AddNormalizerIfNeeded(host, ch, trainer, ref view, feature, input.NormalizeFeatures);

        ch.Trace("Binding columns");
        var roleMappedData = new RoleMappedData(view, label, feature, group, weight, name, custom);

        // Translate the requested caching option into a concrete cache kind (null means no caching).
        Cache.CachingType? cachingType = null;
        switch (input.Caching)
        {
            case CachingOptions.Memory:
                cachingType = Cache.CachingType.Memory;
                break;

            case CachingOptions.Disk:
                cachingType = Cache.CachingType.Disk;
                break;

            case CachingOptions.Auto:
                // REVIEW: we should switch to hybrid caching in future.
                if (!(input.TrainingData is BinaryLoader) && trainer.Info.WantCaching)
                {
                    // default to Memory so mml is on par with maml
                    cachingType = Cache.CachingType.Memory;
                }
                break;

            case CachingOptions.None:
                break;

            default:
                throw ch.ExceptParam(nameof(input.Caching), "Unknown option for caching: '{0}'", input.Caching);
        }

        // Train on the cached view when caching applies; the returned model still refers to the uncached data.
        RoleMappedData cachedRoleMappedData = roleMappedData;
        if (cachingType.HasValue)
        {
            var cacheInput = new Cache.CacheInput() { Data = roleMappedData.Data, Caching = cachingType.Value };
            var cacheView = Cache.CacheData(host, cacheInput).OutputData;
            cachedRoleMappedData = new RoleMappedData(cacheView, roleMappedData.Schema.GetColumnRoleNames());
        }

        var predictor = TrainUtils.Train(host, ch, cachedRoleMappedData, trainer, calibrator, maxCalibrationExamples);
        return new TOut() { PredictorModel = new PredictorModelImpl(host, roleMappedData, input.TrainingData, predictor) };
    }
}
/// <summary>
/// Core training routine that concrete trainers must implement.
/// </summary>
/// <param name="ch">Channel handed to the implementation.</param>
/// <param name="data">Role-mapped data to train on.</param>
/// <param name="count">NOTE(review): meaning is not visible from this declaration (possibly a class or output count); confirm against implementers.</param>
/// <returns>The model produced by training.</returns>
private protected abstract TModel TrainCore(IChannel ch, RoleMappedData data, int count);
/// <summary>
/// Determines the number of classes on the first call, then replaces every NaN label in
/// <paramref name="labels"/> with the last class index (<c>_numClass - 1</c>).
/// </summary>
/// <param name="ch">Channel used for checks and exceptions.</param>
/// <param name="data">Training data; a key-typed label supplies the class count directly.</param>
/// <param name="labels">Label values, modified in place.</param>
private protected override void ConvertNaNLabels(IChannel ch, RoleMappedData data, float[] labels)
{
    // Only initialize one time.
    if (_numClass < 0)
    {
        float smallest = float.MaxValue;
        float largest = float.MinValue;
        bool sawNaN = false;
        foreach (var value in labels)
        {
            if (float.IsNaN(value))
            {
                sawNaN = true;
                continue;
            }
            smallest = Math.Min(smallest, value);
            largest = Math.Max(largest, value);
        }

        ch.CheckParam(smallest >= 0, nameof(data), "min labelColumn cannot be negative");
        if (largest >= _maxNumClass)
        {
            throw ch.ExceptParam(nameof(data), $"max labelColumn cannot exceed {_maxNumClass}");
        }

        // NaN labels get one extra class of their own.
        int extraClassForNaN = sawNaN ? 1 : 0;
        if (data.Schema.Label.Value.Type is KeyType keyType)
        {
            _numClass = keyType.GetCountAsInt32(Host) + extraClassForNaN;
            _tlcNumClass = keyType.GetCountAsInt32(Host);
        }
        else
        {
            int observedClasses = (int)largest + 1;
            _numClass = observedClasses + extraClassForNaN;
            _tlcNumClass = observedClasses;
        }
    }

    float defaultLabel = _numClass - 1;
    for (int i = 0; i < labels.Length; ++i)
    {
        if (float.IsNaN(labels[i]))
        {
            labels[i] = defaultLabel;
        }
    }
}
/// <summary>
/// This method takes a <see cref="RoleMappedData"/> as input, saves it as an in-memory <see cref="ZipArchive"/>
/// and returns two arrays indexed by the entries in the zip:
/// 1. An array of byte arrays, containing the byte sequences of each entry.
/// 2. An array of strings, containing the name of each entry.
///
/// This method is used for comparing pipelines. Its outputs can be passed to <see cref="CheckSamePipeline"/>
/// to check if this pipeline is identical to another pipeline.
/// </summary>
/// <param name="env">Host environment; must not be null.</param>
/// <param name="ch">Channel used for checks and exceptions; must not be null.</param>
/// <param name="data">The data whose pipeline is serialized; must not be null.</param>
/// <param name="dataSerialized">Receives the bytes of every zip entry, ordered by entry name.</param>
/// <param name="dataZipEntryNames">Receives the full name of every zip entry, in the same order.</param>
public static void SerializeRoleMappedData(IHostEnvironment env, IChannel ch, RoleMappedData data,
    out byte[][] dataSerialized, out string[] dataZipEntryNames)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(ch, nameof(ch));
    ch.CheckValue(data, nameof(data));

    using (var ms = new MemoryStream())
    {
        TrainUtils.SaveModel(env, ch, ms, null, data);

        // Dispose the archive deterministically; the memory stream is left open because the
        // enclosing using statement owns it.
        using (var zip = new ZipArchive(ms, ZipArchiveMode.Read, leaveOpen: true))
        {
            // Sort by name so that two archives can be compared entry by entry.
            var entries = zip.Entries.OrderBy(e => e.FullName).ToArray();
            dataSerialized = new byte[entries.Length][];
            dataZipEntryNames = new string[entries.Length];
            for (int i = 0; i < entries.Length; i++)
            {
                dataZipEntryNames[i] = entries[i].FullName;

                var buffer = new byte[entries[i].Length];
                using (var s = entries[i].Open())
                {
                    // Stream.Read may return fewer bytes than requested, so keep reading until
                    // the buffer is full.
                    int offset = 0;
                    while (offset < buffer.Length)
                    {
                        int read = s.Read(buffer, offset, buffer.Length - offset);
                        if (read <= 0)
                        {
                            throw ch.Except("Unexpected end of zip entry '{0}' after {1} of {2} bytes",
                                entries[i].FullName, offset, buffer.Length);
                        }
                        offset += read;
                    }
                }
                dataSerialized[i] = buffer;
            }
        }
    }
}
/// <summary>
/// Sets the multiclass-specific options before training: the class count, the objective
/// ("multiclass" with softmax, otherwise "multiclassova") and, if none is set yet, the default metric.
/// When softmax is not explicitly configured it is enabled once the number of labels reaches
/// <c>_minDataToUseSoftmax</c>.
/// </summary>
/// <param name="ch">Channel used for assertions and the auto-tuning message.</param>
/// <param name="data">Training data (not inspected here).</param>
/// <param name="labels">Labels; only their count is used.</param>
/// <param name="groups">Group information (not used here).</param>
private protected override void CheckAndUpdateParametersBeforeTraining(IChannel ch, RoleMappedData data, float[] labels, int[] groups)
{
    Host.AssertValue(ch);
    ch.Assert(PredictionKind == PredictionKind.MultiClassClassification);
    ch.Assert(_numClass > 1);
    Options["num_class"] = _numClass;

    bool useSoftmax;
    if (Args.UseSoftmax.HasValue)
    {
        // An explicit user setting always wins.
        useSoftmax = Args.UseSoftmax.Value;
    }
    else
    {
        useSoftmax = labels.Length >= _minDataToUseSoftmax;
        ch.Info("Auto-tuning parameters: " + nameof(Args.UseSoftmax) + " = " + useSoftmax);
    }

    if (useSoftmax)
    {
        Options["objective"] = "multiclass";
    }
    else
    {
        Options["objective"] = "multiclassova";
    }

    // Add default metric.
    if (!Options.ContainsKey("metric"))
    {
        Options["metric"] = "multi_error";
    }
}
/// <summary>
/// Trains the binary model for a single class: labels are remapped for class <paramref name="cls"/>,
/// the trainer is fitted, and, when probabilities are requested, the model is calibrated if it is
/// not already a distribution predictor.
/// </summary>
/// <param name="ch">Channel passed to the calibration utility.</param>
/// <param name="trainer">Binary trainer to fit.</param>
/// <param name="data">Training data that must have a label column.</param>
/// <param name="cls">Class passed to the label mapping.</param>
/// <returns>A binary prediction transformer wrapping the trained (and possibly calibrated) model.</returns>
private ISingleFeaturePredictionTransformer<TScalarPredictor> TrainOne(IChannel ch, TScalarTrainer trainer, RoleMappedData data, int cls)
{
    var view = MapLabels(data, cls);

    string trainerLabel = data.Schema.Label.Value.Name;

    // REVIEW: In principle we could support validation sets and the like via the train context, but
    // this is currently unsupported.
    var transformer = trainer.Fit(view);

    if (!_options.UseProbabilities)
    {
        // No probabilities requested: the raw model is used as is.
        return new BinaryPredictionTransformer<TScalarPredictor>(Host, transformer.Model, view.Schema, transformer.FeatureColumn);
    }

    var calibratedModel = transformer.Model as TDistPredictor;

    // REVIEW: restoring the RoleMappedData, as much as we can.
    // not having the weight column on the data passed to the TrainCalibrator should be addressed.
    var trainedData = new RoleMappedData(view, label: trainerLabel, feature: transformer.FeatureColumn);

    if (calibratedModel == null)
    {
        var calibrated = CalibratorUtils.GetCalibratedPredictor(Host, ch, Calibrator, transformer.Model, trainedData, Args.MaxCalibrationExamples);
        calibratedModel = calibrated as TDistPredictor;
    }

    Host.Check(calibratedModel != null, "Calibrated predictor does not implement the expected interface");
    return new BinaryPredictionTransformer<TScalarPredictor>(Host, calibratedModel, trainedData.Data.Schema, transformer.FeatureColumn);
}
/// <summary>
/// Validates the label of <paramref name="examples"/> as a multiclass label and stores the value
/// produced by the check in <c>_numClasses</c>.
/// </summary>
/// <param name="examples">Role-mapped data whose label is checked.</param>
protected override void CheckLabel(RoleMappedData examples)
    => examples.CheckMultiClassLabel(out _numClasses);
/// <summary>
/// Label validation hook that concrete trainers must implement.
/// NOTE(review): presumably implementations throw when the label is unsuitable for the task; confirm against implementers.
/// </summary>
/// <param name="data">Role-mapped data whose label is to be checked.</param>
private protected abstract void CheckLabel(RoleMappedData data);
/// <summary>
/// Trains the binary model for one pair of classes: labels are remapped for
/// <paramref name="cls1"/> and <paramref name="cls2"/>, the trainer is fitted, and the model is
/// calibrated if it is not already a distribution predictor.
/// </summary>
/// <param name="ch">Channel passed to the calibration utility.</param>
/// <param name="trainer">Binary trainer to fit.</param>
/// <param name="data">Training data that must have a label column.</param>
/// <param name="cls1">First class passed to the label mapping.</param>
/// <param name="cls2">Second class passed to the label mapping.</param>
/// <returns>A binary prediction transformer wrapping the (possibly calibrated) model.</returns>
private ISingleFeaturePredictionTransformer<TDistPredictor> TrainOne(IChannel ch, TScalarTrainer trainer, RoleMappedData data, int cls1, int cls2)
{
    // this should not be necessary when the legacy constructor doesn't exist, and the label column is not an optional parameter on the
    // MetaMulticlassTrainer constructor.
    string trainerLabel = data.Schema.Label.Value.Name;

    var pairView = MapLabels(data, cls1, cls2);
    var fitted = trainer.Fit(pairView);

    // the validations in the calibrator check for the feature column, in the RoleMappedData
    var trainedData = new RoleMappedData(pairView, label: trainerLabel, feature: fitted.FeatureColumn);

    // Use the model directly when it already produces probabilities; otherwise calibrate it.
    var calibratedModel = (fitted.Model as TDistPredictor)
        ?? (CalibratorUtils.GetCalibratedPredictor(Host, ch, Calibrator, fitted.Model, trainedData, Args.MaxCalibrationExamples) as TDistPredictor);

    return new BinaryPredictionTransformer<TDistPredictor>(Host, calibratedModel, trainedData.Data.Schema, fitted.FeatureColumn);
}
/// <summary>
/// Validates the training data (and the validation data, when supplied), then trains a matrix
/// factorization model over the label, matrix-row-index and matrix-column-index columns.
/// </summary>
/// <param name="ch">Channel used for checks, assertions and error reporting.</param>
/// <param name="data">Training data; must have a label column of type Single or Double.</param>
/// <param name="validData">Optional validation data. When present, its label must be Single or Double
/// and its matrix index column types must equal those of <paramref name="data"/>.</param>
/// <returns>The trained model parameters.</returns>
private MatrixFactorizationModelParameters TrainCore(IChannel ch, RoleMappedData data, RoleMappedData validData = null)
{
    _host.AssertValue(ch);
    ch.AssertValue(data);
    ch.AssertValueOrNull(validData);

    ch.CheckParam(data.Schema.Label.HasValue, nameof(data), "Input data did not have a unique label");
    RecommenderUtils.CheckAndGetMatrixIndexColumns(data, out var matrixColumnIndexColInfo, out var matrixRowIndexColInfo, isDecode: false);
    var labelCol = data.Schema.Label.Value;
    if (labelCol.Type != NumberDataViewType.Single && labelCol.Type != NumberDataViewType.Double)
    {
        throw ch.Except("Column '{0}' for label should be floating point, but is instead {1}", labelCol.Name, labelCol.Type);
    }

    MatrixFactorizationModelParameters predictor;
    if (validData != null)
    {
        ch.CheckValue(validData, nameof(validData));
        ch.CheckParam(validData.Schema.Label.HasValue, nameof(validData), "Input validation data did not have a unique label");
        RecommenderUtils.CheckAndGetMatrixIndexColumns(validData, out var validMatrixColumnIndexColInfo, out var validMatrixRowIndexColInfo, isDecode: false);
        var validLabelCol = validData.Schema.Label.Value;
        if (validLabelCol.Type != NumberDataViewType.Single && validLabelCol.Type != NumberDataViewType.Double)
        {
            throw ch.Except("Column '{0}' for validation label should be floating point, but is instead {1}", validLabelCol.Name, validLabelCol.Type);
        }

        if (!matrixColumnIndexColInfo.Type.Equals(validMatrixColumnIndexColInfo.Type))
        {
            throw ch.ExceptParam(nameof(validData), "Train and validation sets' matrix-column types differed, {0} vs. {1}", matrixColumnIndexColInfo.Type, validMatrixColumnIndexColInfo.Type);
        }
        if (!matrixRowIndexColInfo.Type.Equals(validMatrixRowIndexColInfo.Type))
        {
            throw ch.ExceptParam(nameof(validData), "Train and validation sets' matrix-row types differed, {0} vs. {1}", matrixRowIndexColInfo.Type, validMatrixRowIndexColInfo.Type);
        }
    }

    int colCount = matrixColumnIndexColInfo.Type.GetKeyCountAsInt32(_host);
    int rowCount = matrixRowIndexColInfo.Type.GetKeyCountAsInt32(_host);
    ch.Assert(rowCount > 0);
    ch.Assert(colCount > 0);

    // Checks for equality on the validation set ensure it is correct here.
    using (var cursor = data.Data.GetRowCursor(matrixColumnIndexColInfo, matrixRowIndexColInfo, data.Schema.Label.Value))
    {
        // LibMF works only over single precision floats, but we want to be able to consume either.
        var labGetter = RowCursorUtils.GetGetterAs<float>(NumberDataViewType.Single, cursor, data.Schema.Label.Value.Index);
        var matrixColumnIndexGetter = RowCursorUtils.GetGetterAs<uint>(NumberDataViewType.UInt32, cursor, matrixColumnIndexColInfo.Index);
        var matrixRowIndexGetter = RowCursorUtils.GetGetterAs<uint>(NumberDataViewType.UInt32, cursor, matrixRowIndexColInfo.Index);

        if (validData == null)
        {
            // Have the trainer do its work.
            using (var buffer = PrepareBuffer())
            {
                buffer.Train(ch, rowCount, colCount, cursor, labGetter, matrixRowIndexGetter, matrixColumnIndexGetter);
                predictor = new MatrixFactorizationModelParameters(_host, buffer, (KeyType)matrixColumnIndexColInfo.Type, (KeyType)matrixRowIndexColInfo.Type);
            }
        }
        else
        {
            RecommenderUtils.CheckAndGetMatrixIndexColumns(validData, out var validMatrixColumnIndexColInfo, out var validMatrixRowIndexColInfo, isDecode: false);

            // Open the validation cursor with the validation set's own columns. The getters below
            // read at the validation columns' indices, so requesting the training set's columns here
            // would be wrong whenever the two schemas order their columns differently.
            using (var validCursor = validData.Data.GetRowCursor(validMatrixColumnIndexColInfo, validMatrixRowIndexColInfo, validData.Schema.Label.Value))
            {
                ValueGetter<float> validLabelGetter = RowCursorUtils.GetGetterAs<float>(NumberDataViewType.Single, validCursor, validData.Schema.Label.Value.Index);
                var validMatrixColumnIndexGetter = RowCursorUtils.GetGetterAs<uint>(NumberDataViewType.UInt32, validCursor, validMatrixColumnIndexColInfo.Index);
                var validMatrixRowIndexGetter = RowCursorUtils.GetGetterAs<uint>(NumberDataViewType.UInt32, validCursor, validMatrixRowIndexColInfo.Index);

                // Have the trainer do its work.
                using (var buffer = PrepareBuffer())
                {
                    buffer.TrainWithValidation(ch, rowCount, colCount,
                        cursor, labGetter, matrixRowIndexGetter, matrixColumnIndexGetter,
                        validCursor, validLabelGetter, validMatrixRowIndexGetter, validMatrixColumnIndexGetter);
                    predictor = new MatrixFactorizationModelParameters(_host, buffer, (KeyType)matrixColumnIndexColInfo.Type, (KeyType)matrixRowIndexColInfo.Type);
                }
            }
        }
    }

    return predictor;
}
/// <summary>
/// Verifies that the training data satisfies the requirements of this trainer and its
/// subclasses, wraps the data in a shuffling transform when it cannot shuffle on its own,
/// and throws if the requirements cannot be met.
/// </summary>
/// <param name="ch">The channel used for assertions and checks.</param>
/// <param name="examples">The training examples.</param>
/// <param name="weightSetCount">Receives the length of the weights and bias array. For binary classification
/// and regression this is 1. For multi-class classification this equals the number of classes on the label.</param>
/// <returns>A potentially modified version of <paramref name="examples"/>.</returns>
private protected RoleMappedData PrepareDataFromTrainingExamples(IChannel ch, RoleMappedData examples, out int weightSetCount)
{
    ch.AssertValue(examples);
    CheckLabel(examples, out weightSetCount);
    examples.CheckFeatureFloatVector();

    // Start from the incoming view and only wrap it when it is not shufflable by itself.
    var sourceView = examples.Data;
    IDataView trainView = sourceView;
    if (!sourceView.CanShuffle)
    {
        var shuffleOptions = new RowShufflingTransformer.Options
        {
            PoolOnly = false,
            ForceShuffle = ShuffleData
        };
        trainView = new RowShufflingTransformer(Host, shuffleOptions, sourceView);
    }
    ch.Assert(trainView.CanShuffle);

    // Re-attach the original column roles to the (possibly wrapped) view.
    var roleNames = examples.Schema.GetColumnRoleNames();
    var prepared = new RoleMappedData(trainView, roleNames);

    ch.Assert(prepared.Schema.Label.HasValue);
    ch.Assert(prepared.Schema.Feature.HasValue);
    // A weight column on the input must survive the re-mapping.
    ch.Assert(!examples.Schema.Weight.HasValue || prepared.Schema.Weight.HasValue);

    ch.Check(prepared.Schema.Feature.Value.Type is VectorType featureType && featureType.Size > 0, "Training set has no features, aborting training.");
    return prepared;
}