/// <summary>
/// Builds a dataset in which the feature slots that are not selected are zeroed out.
/// </summary>
/// <param name="host">Host used to create the column mapper.</param>
/// <param name="data">Input data; its schema must have a feature column.</param>
/// <param name="features">
/// Selection mask; its length must equal the vector size of the feature column.
/// </param>
/// <returns>
/// <paramref name="data"/> itself when every slot is selected; otherwise a new
/// <see cref="RoleMappedData"/> wrapping a mapped view with the same column roles.
/// </returns>
public static RoleMappedData SelectFeatures(IHost host, RoleMappedData data, BitArray features)
{
    Contracts.AssertValue(host);
    Contracts.AssertValue(data);
    Contracts.Assert(data.Schema.Feature.HasValue);
    Contracts.AssertValue(features);

    var featureColumn = data.Schema.Feature.Value;
    var featureType = featureColumn.Type;
    var slotCount = featureType.GetVectorSize();
    Contracts.Assert(features.Length == slotCount);

    int selectedCount = Utils.GetCardinality(features);

    // Every slot is selected, so there is nothing to zero out.
    if (selectedCount == slotCount)
    {
        return data;
    }

    // REVIEW: This doesn't preserve metadata on the features column. Should it?
    var columnName = featureColumn.Name;

    // The feature column is mapped onto itself (same name, same type).
    var mappedView = LambdaColumnMapper.Create(
        host,
        "FeatureSelector",
        data.Data,
        columnName,
        columnName,
        featureType,
        featureType,
        (in VBuffer<Single> src, ref VBuffer<Single> dst) =>
            SelectFeatures(in src, features, selectedCount, ref dst));

    return new RoleMappedData(mappedView, data.Schema.GetColumnRoleNames());
}
/// <summary> /// Create the internal transform (not serialized in the zip file). /// </summary> private IDataTransform CreateTemplatedTransform() { IDataView view = Source; var schema = _input.Schema; int index; for (int i = 0; i < _args.columns.Length; ++i) { if (!schema.TryGetColumnIndex(_args.columns[i].Source, out index)) { throw _host.Except("Unable to find '{0}'", _args.columns[i].Source); } var typeCol = schema.GetColumnType(index); if (typeCol.IsVector()) { throw _host.Except("Expected a number as input."); } switch (typeCol.RawKind()) { case DataKind.R4: view = new PassThroughTransform(_host, new PassThroughTransform.Arguments(), LambdaColumnMapper.Create(_host, "R42R4", view, _args.columns[i].Source, _args.columns[i].Name, NumberType.R4, NumberType.R4, (in float src, ref float dst) => { dst = src; })); break;
/// <summary>
/// Maps the label column of <paramref name="data"/> into a float column named
/// <paramref name="dstName"/>: 1 when <paramref name="equalsTarget"/> holds, otherwise 0.
/// When <c>Args.ImputeMissingLabelsAsNegative</c> is false and a missing-value predicate
/// exists for <paramref name="type"/>, missing labels map to NaN instead of 0.
/// </summary>
/// <typeparam name="T">Raw type of the label column; must match <paramref name="type"/>.</typeparam>
/// <param name="type">Column type of the source label.</param>
/// <param name="equalsTarget">Predicate that is true for labels belonging to the target class.</param>
/// <param name="data">Data whose schema provides the label column.</param>
/// <param name="dstName">Name of the output column.</param>
/// <returns>A view with the mapped label column.</returns>
protected IDataView MapLabelsCore<T>(ColumnType type, RefPredicate<T> equalsTarget, RoleMappedData data, string dstName)
{
    Host.AssertValue(type);
    Host.Assert(type.RawType == typeof(T));
    Host.AssertValue(equalsTarget);
    Host.AssertValue(data);
    Host.AssertValue(data.Schema.Label);
    Host.AssertNonWhiteSpace(dstName);

    var labelColumn = data.Schema.Label;
    RefPredicate<T> isMissing;

    // Simple case: either missing labels are treated as negatives, or the type has
    // no missing-value predicate. The predicate lookup only runs when imputation is off.
    if (Args.ImputeMissingLabelsAsNegative || !Conversions.Instance.TryGetIsNAPredicate(type, out isMissing))
    {
        return LambdaColumnMapper.Create(Host, "Label mapper", data.Data,
            labelColumn.Name, dstName, type, NumberType.Float,
            (ref T src, ref Float dst) =>
            {
                if (equalsTarget(ref src))
                {
                    dst = 1;
                }
                else
                {
                    dst = default(Float);
                }
            });
    }

    // Missing labels are propagated as NaN.
    return LambdaColumnMapper.Create(Host, "Label mapper", data.Data,
        labelColumn.Name, dstName, type, NumberType.Float,
        (ref T src, ref Float dst) =>
        {
            if (equalsTarget(ref src))
            {
                dst = 1;
            }
            else if (isMissing(ref src))
            {
                dst = Float.NaN;
            }
            else
            {
                dst = default(Float);
            }
        });
}
/// <summary>
/// Replaces the key column <paramref name="col"/> with a float column. Missing keys become
/// NaN. Otherwise the key is converted to ulong and mapped to <c>key - 1</c>, or, when
/// <paramref name="seed"/> is non-zero, to the entry at index <c>key - 1</c> of a seeded
/// random permutation.
/// </summary>
/// <typeparam name="TInput">Raw type of the key column.</typeparam>
/// <param name="env">Environment used to build the mapper.</param>
/// <param name="ch">Channel used for the cardinality check.</param>
/// <param name="input">Source data view.</param>
/// <param name="col">Name of the column, used as both source and destination.</param>
/// <param name="type">Key type of the column.</param>
/// <param name="seed">Zero for the direct mapping; otherwise the permutation seed.</param>
/// <returns>A view in which <paramref name="col"/> is a Single column.</returns>
private static IDataView AppendFloatMapper<TInput>(IHostEnvironment env, IChannel ch, IDataView input, string col, KeyDataViewType type, int seed)
{
    // Every key type converts to ulong, so convert to that most general type up front
    // instead of special-casing each possible key representation.
    KeyDataViewType ulongKeyType = new KeyDataViewType(typeof(ulong), type.Count);
    bool identity;
    var toUlong = Conversions.Instance.GetStandardConversion<TInput, ulong>(type, ulongKeyType, out identity);
    var isMissing = Conversions.Instance.GetIsNAPredicate<TInput>(type);

    ValueMapper<TInput, Single> mapper;
    if (seed != 0)
    {
        ch.Check(type.Count > 0, "Label must be of known cardinality.");
        int[] shuffled = Utils.GetRandomPermutation(RandomUtils.Create(seed), type.GetCountAsInt32(env));
        mapper =
            (in TInput src, ref Single dst) =>
            {
                if (isMissing(in src))
                {
                    dst = Single.NaN;
                    return;
                }

                // Attention: this delegate is called from multiple threads.
                // The scratch value must stay local to the delegate body; hoisting it out
                // would share it between threads and cause a race condition.
                ulong key = 0;
                toUlong(in src, ref key);
                dst = (Single)shuffled[(int)(key - 1)];
            };
    }
    else
    {
        mapper =
            (in TInput src, ref Single dst) =>
            {
                if (isMissing(in src))
                {
                    dst = Single.NaN;
                    return;
                }

                // Attention: this delegate is called from multiple threads.
                // The scratch value must stay local to the delegate body; hoisting it out
                // would share it between threads and cause a race condition.
                ulong key = 0;
                toUlong(in src, ref key);
                dst = (Single)key - 1;
            };
    }

    return LambdaColumnMapper.Create(env, "Key to Float Mapper", input, col, col, type, NumberDataViewType.Single, mapper);
}
/// <summary>
/// Replaces the key column <paramref name="col"/> with a float column. Missing keys become
/// NaN. Otherwise the key is converted to ulong and mapped to <c>key - 1</c>, or, when
/// <paramref name="seed"/> is non-zero, to the entry at index <c>key - 1</c> of a seeded
/// random permutation.
/// </summary>
/// <typeparam name="TInput">Raw type of the key column.</typeparam>
/// <param name="env">Environment used to build the mapper.</param>
/// <param name="ch">Channel used for the cardinality check.</param>
/// <param name="input">Source data view.</param>
/// <param name="col">Name of the column, used as both source and destination.</param>
/// <param name="type">Key type of the column.</param>
/// <param name="seed">Zero for the direct mapping; otherwise the permutation seed.</param>
/// <returns>A view in which <paramref name="col"/> is a float column.</returns>
private static IDataView AppendFloatMapper<TInput>(IHostEnvironment env, IChannel ch, IDataView input, string col, KeyType type, int seed)
{
    // Any key is convertible to ulong, so rather than add special case handling for all possible
    // key-types we just upfront convert it to the most general type (ulong) and work from there.
    KeyType dstType = new KeyType(DataKind.U8, type.Min, type.Count, type.Contiguous);
    bool identity;
    var converter = Conversions.Instance.GetStandardConversion<TInput, ulong>(type, dstType, out identity);
    var isNa = Conversions.Instance.GetIsNAPredicate<TInput>(type);

    ValueMapper<TInput, Single> mapper;
    if (seed == 0)
    {
        mapper =
            (in TInput src, ref Single dst) =>
            {
                // The mapper may be invoked from multiple threads at once, so the scratch
                // value must be local to each invocation. Capturing a single variable
                // declared outside the lambda would share it between threads and race.
                ulong temp = 0;
                if (isNa(in src))
                {
                    dst = Single.NaN;
                    return;
                }
                converter(in src, ref temp);
                dst = (Single)(temp - 1);
            };
    }
    else
    {
        ch.Check(type.Count > 0, "Label must be of known cardinality.");
        int[] permutation = Utils.GetRandomPermutation(RandomUtils.Create(seed), type.Count);
        mapper =
            (in TInput src, ref Single dst) =>
            {
                // The mapper may be invoked from multiple threads at once, so the scratch
                // value must be local to each invocation. Capturing a single variable
                // declared outside the lambda would share it between threads and race.
                ulong temp = 0;
                if (isNa(in src))
                {
                    dst = Single.NaN;
                    return;
                }
                converter(in src, ref temp);
                dst = (Single)permutation[(int)(temp - 1)];
            };
    }

    return LambdaColumnMapper.Create(env, "Key to Float Mapper", input, col, col, type, NumberType.Float, mapper);
}
/// <summary>
/// Maps the label column of <paramref name="data"/> in place to a boolean column that is
/// true exactly when <paramref name="equalsTarget"/> holds for the source label.
/// When <c>Args.ImputeMissingLabelsAsNegative</c> is false, the data is first passed
/// through an <see cref="NAFilter"/> on the label column before being mapped.
/// </summary>
/// <typeparam name="T">Raw type of the label column; must match <paramref name="type"/>.</typeparam>
/// <param name="type">Column type of the source label.</param>
/// <param name="equalsTarget">Predicate that is true for labels belonging to the target class.</param>
/// <param name="data">Data whose schema provides the label column.</param>
/// <returns>A view whose label column has been replaced by the boolean mapping.</returns>
private protected IDataView MapLabelsCore<T>(DataViewType type, InPredicate<T> equalsTarget, RoleMappedData data)
{
    Host.AssertValue(type);
    Host.Assert(type.RawType == typeof(T));
    Host.AssertValue(equalsTarget);
    Host.AssertValue(data);
    Host.Assert(data.Schema.Label.HasValue);

    var label = data.Schema.Label.Value;

    IDataView dataView = data.Data;
    if (!Args.ImputeMissingLabelsAsNegative)
    {
        dataView = new NAFilter(Host, data.Data, false, label.Name);
    }

    // Map over dataView, not data.Data: the original passed data.Data here, which
    // silently discarded the NAFilter built above.
    return LambdaColumnMapper.Create(Host, "Label mapper", dataView,
        label.Name, label.Name, type, BooleanDataViewType.Instance,
        (in T src, ref bool dst) => dst = equalsTarget(in src));
}
private IDataView MapLabels(RoleMappedData data, int cls, out string dstName, IChannel ch) { var lab = data.Schema.Label.Value; Host.Assert(!data.Schema.Schema[lab.Index].IsHidden); Host.Assert(lab.Type.GetKeyCount() > 0 || lab.Type == NumberDataViewType.Single || lab.Type == NumberDataViewType.Double); // Get the destination label column name. dstName = data.Schema.Schema.GetTempColumnName(); // Key values are 1-based. if (lab.Type.GetKeyCount() > 0) { uint key = (uint)(cls + 1); if (_args.downsampling > 0) { return(CreateTrainingView(data, key, 1f, -1f, 0f, NumberDataViewType.UInt32, NumberDataViewType.Single, ch)); } else { return(LambdaColumnMapper.Create(Host, "LabelColumnMapper in oOVA (1)", FilterNA(data.Data, lab.Name), lab.Name, dstName, NumberDataViewType.UInt32, NumberDataViewType.Single, (in uint src, ref float dst) => { dst = src == key ? 1 : default(float); }));
/// <summary>
/// Maps the label column of <paramref name="data"/> in place to a float column:
/// 1 when <paramref name="equalsTarget"/> holds, otherwise 0. When
/// <c>Args.ImputeMissingLabelsAsNegative</c> is false and a missing-value predicate exists
/// for <paramref name="type"/>, missing labels map to NaN instead of 0.
/// </summary>
/// <typeparam name="T">Raw type of the label column; must match <paramref name="type"/>.</typeparam>
/// <param name="type">Column type of the source label.</param>
/// <param name="equalsTarget">Predicate that is true for labels belonging to the target class.</param>
/// <param name="data">Data whose schema provides the label column.</param>
/// <returns>A view whose label column has been replaced by the float mapping.</returns>
private protected IDataView MapLabelsCore<T>(ColumnType type, InPredicate<T> equalsTarget, RoleMappedData data)
{
    Host.AssertValue(type);
    Host.Assert(type.RawType == typeof(T));
    Host.AssertValue(equalsTarget);
    Host.AssertValue(data);
    Host.Assert(data.Schema.Label.HasValue);

    var labelColumn = data.Schema.Label.Value;
    InPredicate<T> isMissing;

    // Simple case: either missing labels are treated as negatives, or the type has
    // no missing-value predicate. The predicate lookup only runs when imputation is off.
    if (Args.ImputeMissingLabelsAsNegative || !Conversions.Instance.TryGetIsNAPredicate(type, out isMissing))
    {
        return LambdaColumnMapper.Create(Host, "Label mapper", data.Data,
            labelColumn.Name, labelColumn.Name, type, NumberType.Float,
            (in T src, ref float dst) =>
            {
                if (equalsTarget(in src))
                {
                    dst = 1;
                }
                else
                {
                    dst = default(float);
                }
            });
    }

    // Missing labels are propagated as NaN.
    return LambdaColumnMapper.Create(Host, "Label mapper", data.Data,
        labelColumn.Name, labelColumn.Name, type, NumberType.Float,
        (in T src, ref float dst) =>
        {
            if (equalsTarget(in src))
            {
                dst = 1;
            }
            else if (isMissing(in src))
            {
                dst = float.NaN;
            }
            else
            {
                dst = default(float);
            }
        });
}
/// <summary> /// Create the internal transform (not serialized in the zip file). /// </summary> private IDataTransform CreateTemplatedTransform() { IDataView view = Source; var schema = _input.Schema; int index; for (int i = 0; i < _args.columns.Length; ++i) { index = SchemaHelper.GetColumnIndex(schema, _args.columns[i].Source); var typeCol = schema[index].Type; if (typeCol.IsVector()) { throw _host.Except("Expected a number as input."); } switch (typeCol.RawKind()) { case DataKind.Single: view = new PassThroughTransform(_host, new PassThroughTransform.Arguments(), LambdaColumnMapper.Create(_host, "R42R4", view, _args.columns[i].Source, _args.columns[i].Name, NumberDataViewType.Single, NumberDataViewType.Single, (in float src, ref float dst) => { dst = src; })); break;
IDataTransform AppendToPipeline(IDataView input) { IDataView current = input; if (_shuffleInput) { var args1 = new RowShufflingTransformer.Arguments() { ForceShuffle = false, ForceShuffleSeed = _seedShuffle, PoolRows = _poolRows, PoolOnly = false, }; current = new RowShufflingTransformer(Host, args1, current); } // We generate a random number. var columnName = current.Schema.GetTempColumnName(); var args2 = new GenerateNumberTransform.Arguments() { Column = new GenerateNumberTransform.Column[] { new GenerateNumberTransform.Column() { Name = columnName } }, Seed = _seed ?? 42 }; IDataTransform currentTr = new GenerateNumberTransform(Host, args2, current); // We convert this random number into a part. var cRatios = new float[_ratios.Length]; cRatios[0] = 0; for (int i = 1; i < _ratios.Length; ++i) { cRatios[i] = cRatios[i - 1] + _ratios[i - 1]; } ValueMapper <float, int> mapper = (in float src, ref int dst) => { for (int i = cRatios.Length - 1; i > 0; --i) { if (src >= cRatios[i]) { dst = i; return; } } dst = 0; }; // Get location of columnName int index; currentTr.Schema.TryGetColumnIndex(columnName, out index); var ct = currentTr.Schema.GetColumnType(index); var view = LambdaColumnMapper.Create(Host, "Key to part mapper", currentTr, columnName, _newColumn, ct, NumberType.I4, mapper); // We cache the result to avoid the pipeline to change the random number. var args3 = new ExtendedCacheTransform.Arguments() { inDataFrame = string.IsNullOrEmpty(_cacheFile), numTheads = _numThreads, cacheFile = _cacheFile, reuse = _reuse, }; currentTr = new ExtendedCacheTransform(Host, args3, view); // Removing the temporary column. var finalTr = ColumnSelectingTransformer.CreateDrop(Host, currentTr, new string[] { columnName }); var taggedViews = new List <Tuple <string, ITaggedDataView> >(); // filenames if (_filenames != null || _tags != null) { int nbf = _filenames == null ? 
0 : _filenames.Length; if (nbf > 0 && nbf != _ratios.Length) { throw Host.Except("Differen number of filenames and ratios."); } int nbt = _tags == null ? 0 : _tags.Length; if (nbt > 0 && nbt != _ratios.Length) { throw Host.Except("Differen number of filenames and ratios."); } int nb = Math.Max(nbf, nbt); using (var ch = Host.Start("Split the datasets and stores each part.")) { for (int i = 0; i < nb; ++i) { if (_filenames == null || !_filenames.Any()) { ch.Info("Create part {0}: {1} (tag: {2})", i + 1, _ratios[i], _tags[i]); } else { ch.Info("Create part {0}: {1} (file: {2})", i + 1, _ratios[i], _filenames[i]); } var ar1 = new RangeFilter.Arguments() { Column = _newColumn, Min = i, Max = i, IncludeMax = true }; int pardId = i; var filtView = LambdaFilter.Create <int>(Host, string.Format("Select part {0}", i), currentTr, _newColumn, NumberType.I4, (in int part) => { return(part.Equals(pardId)); });