/// <summary>
/// Validates the training data against the requirements of this trainer (and its subclasses) and
/// returns a data set whose underlying view can be shuffled. When the incoming view does not
/// support shuffling it is wrapped in a <see cref="RowShufflingTransformer"/>. Validation
/// failures surface as exceptions.
/// </summary>
/// <param name="ch">The channel used for assertions and checks.</param>
/// <param name="examples">The training examples.</param>
/// <param name="weightSetCount">Receives the length of the weights and bias array, as reported by
/// <c>CheckLabel</c>. For binary classification and regression this is 1; for multi-class
/// classification it equals the number of classes on the label.</param>
/// <returns>A new <see cref="RoleMappedData"/> built over the (possibly wrapped) view, carrying the
/// same column roles as <paramref name="examples"/>.</returns>
protected RoleMappedData PrepareDataFromTrainingExamples(IChannel ch, RoleMappedData examples, out int weightSetCount)
{
    ch.AssertValue(examples);

    // Label validation also determines how many weight sets the trainer needs.
    CheckLabel(examples, out weightSetCount);
    examples.CheckFeatureFloatVector();

    // Start from the incoming view; only wrap it when it cannot shuffle on its own.
    var sourceView = examples.Data;
    IDataView trainView = sourceView;
    if (!sourceView.CanShuffle)
    {
        var shuffleOptions = new RowShufflingTransformer.Arguments
        {
            PoolOnly = false,
            ForceShuffle = ShuffleData
        };
        trainView = new RowShufflingTransformer(Host, shuffleOptions, sourceView);
    }

    ch.Assert(trainView.CanShuffle);

    // Rebuild the role-mapped data over the chosen view, reusing the original role assignments.
    var trainData = new RoleMappedData(trainView, examples.Schema.GetColumnRoleNames());
    var trainSchema = trainData.Schema;

    ch.AssertValue(trainSchema.Label);
    ch.AssertValue(trainSchema.Feature);

    // A weight column present on the input must still be present after re-mapping.
    if (examples.Schema.Weight != null)
    {
        ch.AssertValue(trainSchema.Weight);
    }

    int featureCount = trainSchema.Feature.Type.VectorSize;
    ch.Check(featureCount > 0, "Training set has no features, aborting training.");

    return trainData;
}
IDataTransform AppendToPipeline(IDataView input) { IDataView current = input; if (_shuffleInput) { var args1 = new RowShufflingTransformer.Arguments() { ForceShuffle = false, ForceShuffleSeed = _seedShuffle, PoolRows = _poolRows, PoolOnly = false, }; current = new RowShufflingTransformer(Host, args1, current); } // We generate a random number. var columnName = current.Schema.GetTempColumnName(); var args2 = new GenerateNumberTransform.Arguments() { Column = new GenerateNumberTransform.Column[] { new GenerateNumberTransform.Column() { Name = columnName } }, Seed = _seed ?? 42 }; IDataTransform currentTr = new GenerateNumberTransform(Host, args2, current); // We convert this random number into a part. var cRatios = new float[_ratios.Length]; cRatios[0] = 0; for (int i = 1; i < _ratios.Length; ++i) { cRatios[i] = cRatios[i - 1] + _ratios[i - 1]; } ValueMapper <float, int> mapper = (in float src, ref int dst) => { for (int i = cRatios.Length - 1; i > 0; --i) { if (src >= cRatios[i]) { dst = i; return; } } dst = 0; }; // Get location of columnName int index; currentTr.Schema.TryGetColumnIndex(columnName, out index); var ct = currentTr.Schema.GetColumnType(index); var view = LambdaColumnMapper.Create(Host, "Key to part mapper", currentTr, columnName, _newColumn, ct, NumberType.I4, mapper); // We cache the result to avoid the pipeline to change the random number. var args3 = new ExtendedCacheTransform.Arguments() { inDataFrame = string.IsNullOrEmpty(_cacheFile), numTheads = _numThreads, cacheFile = _cacheFile, reuse = _reuse, }; currentTr = new ExtendedCacheTransform(Host, args3, view); // Removing the temporary column. var finalTr = ColumnSelectingTransformer.CreateDrop(Host, currentTr, new string[] { columnName }); var taggedViews = new List <Tuple <string, ITaggedDataView> >(); // filenames if (_filenames != null || _tags != null) { int nbf = _filenames == null ? 
0 : _filenames.Length; if (nbf > 0 && nbf != _ratios.Length) { throw Host.Except("Differen number of filenames and ratios."); } int nbt = _tags == null ? 0 : _tags.Length; if (nbt > 0 && nbt != _ratios.Length) { throw Host.Except("Differen number of filenames and ratios."); } int nb = Math.Max(nbf, nbt); using (var ch = Host.Start("Split the datasets and stores each part.")) { for (int i = 0; i < nb; ++i) { if (_filenames == null || !_filenames.Any()) { ch.Info("Create part {0}: {1} (tag: {2})", i + 1, _ratios[i], _tags[i]); } else { ch.Info("Create part {0}: {1} (file: {2})", i + 1, _ratios[i], _filenames[i]); } var ar1 = new RangeFilter.Arguments() { Column = _newColumn, Min = i, Max = i, IncludeMax = true }; int pardId = i; var filtView = LambdaFilter.Create <int>(Host, string.Format("Select part {0}", i), currentTr, _newColumn, NumberType.I4, (in int part) => { return(part.Equals(pardId)); });