Пример #1
0
        /// <summary>
        /// Return a dataset with non-selected features zeroed out.
        /// </summary>
        /// <param name="host">Host passed to the column mapper that rewrites the feature column.</param>
        /// <param name="data">Input data; its schema is asserted to have a feature column.</param>
        /// <param name="features">Selection mask; asserted to have one bit per feature slot.</param>
        /// <returns>
        /// <paramref name="data"/> itself when the mask's cardinality equals the feature vector size;
        /// otherwise a new <see cref="RoleMappedData"/> over the mapped view with the same column roles.
        /// </returns>
        public static RoleMappedData SelectFeatures(IHost host, RoleMappedData data, BitArray features)
        {
            Contracts.AssertValue(host);
            Contracts.AssertValue(data);
            Contracts.Assert(data.Schema.Feature.HasValue);
            Contracts.AssertValue(features);

            var featureColumn = data.Schema.Feature.Value;
            var featureType = featureColumn.Type;
            var slotCount = featureType.GetVectorSize();
            Contracts.Assert(features.Length == slotCount);

            int selectedCount = Utils.GetCardinality(features);
            if (selectedCount != slotCount)
            {
                // REVIEW: This doesn't preserve metadata on the features column. Should it?
                // The feature column is mapped onto itself (same name, same type); the per-row
                // work is delegated to the VBuffer overload of SelectFeatures.
                var columnName = featureColumn.Name;
                var mapped = LambdaColumnMapper.Create(
                    host, "FeatureSelector", data.Data, columnName, columnName, featureType, featureType,
                    (in VBuffer<Single> src, ref VBuffer<Single> dst) =>
                    {
                        SelectFeatures(in src, features, selectedCount, ref dst);
                    });

                return new RoleMappedData(mapped, data.Schema.GetColumnRoleNames());
            }

            // Cardinality equals the vector size, so there is nothing to mask out.
            return data;
        }
Пример #2
0
        /// <summary>
        /// Create the internal transform (not serialized in the zip file).
        /// </summary>
        private IDataTransform CreateTemplatedTransform()
        {
            IDataView view   = Source;
            var       schema = _input.Schema;
            int       index;

            for (int i = 0; i < _args.columns.Length; ++i)
            {
                if (!schema.TryGetColumnIndex(_args.columns[i].Source, out index))
                {
                    throw _host.Except("Unable to find '{0}'", _args.columns[i].Source);
                }
                var typeCol = schema.GetColumnType(index);
                if (typeCol.IsVector())
                {
                    throw _host.Except("Expected a number as input.");
                }

                switch (typeCol.RawKind())
                {
                case DataKind.R4:
                    view = new PassThroughTransform(_host, new PassThroughTransform.Arguments(),
                                                    LambdaColumnMapper.Create(_host, "R42R4", view,
                                                                              _args.columns[i].Source, _args.columns[i].Name,
                                                                              NumberType.R4, NumberType.R4,
                                                                              (in float src, ref float dst) => { dst = src; }));
                    break;
Пример #3
0
        /// <summary>
        /// Wraps the data view of <paramref name="data"/> with a "Label mapper" that reads the label column
        /// and writes a float column named <paramref name="dstName"/>: 1 where
        /// <paramref name="equalsTarget"/> holds, and the default float value otherwise.
        /// </summary>
        /// <remarks>
        /// When <c>Args.ImputeMissingLabelsAsNegative</c> is false and an is-NA predicate exists for
        /// <paramref name="type"/>, labels that are not the target but are missing map to NaN instead.
        /// </remarks>
        /// <param name="type">Type of the label column; its raw type is asserted to be <typeparamref name="T"/>.</param>
        /// <param name="equalsTarget">Predicate that is true for label values belonging to the target class.</param>
        /// <param name="data">Data whose schema supplies the label column.</param>
        /// <param name="dstName">Name of the output column.</param>
        /// <returns>The mapped data view.</returns>
        protected IDataView MapLabelsCore <T>(ColumnType type, RefPredicate <T> equalsTarget, RoleMappedData data, string dstName)
        {
            Host.AssertValue(type);
            Host.Assert(type.RawType == typeof(T));
            Host.AssertValue(equalsTarget);
            Host.AssertValue(data);
            Host.AssertValue(data.Schema.Label);
            Host.AssertNonWhiteSpace(dstName);

            var labelColumn = data.Schema.Label;

            RefPredicate <T> isNa;

            // Guard clause: missing labels are treated as negatives, either by request or because
            // the label type has no is-NA predicate.
            if (Args.ImputeMissingLabelsAsNegative || !Conversions.Instance.TryGetIsNAPredicate(type, out isNa))
            {
                return LambdaColumnMapper.Create(Host, "Label mapper", data.Data,
                                                 labelColumn.Name, dstName, type, NumberType.Float,
                                                 (ref T src, ref Float dst) =>
                                                 {
                                                     if (equalsTarget(ref src))
                                                     {
                                                         dst = 1;
                                                     }
                                                     else
                                                     {
                                                         dst = default(Float);
                                                     }
                                                 });
            }

            // Missing labels must stay distinguishable: emit NaN for them.
            return LambdaColumnMapper.Create(Host, "Label mapper", data.Data,
                                             labelColumn.Name, dstName, type, NumberType.Float,
                                             (ref T src, ref Float dst) =>
                                             {
                                                 if (equalsTarget(ref src))
                                                 {
                                                     dst = 1;
                                                 }
                                                 else if (isNa(ref src))
                                                 {
                                                     dst = Float.NaN;
                                                 }
                                                 else
                                                 {
                                                     dst = default(Float);
                                                 }
                                             });
        }
        /// <summary>
        /// Replaces the key-typed column <paramref name="col"/> of <paramref name="input"/> with a
        /// <see cref="Single"/> column ("Key to Float Mapper"). Missing keys become NaN. Other keys become
        /// the converted key value minus one, or, when <paramref name="seed"/> is non-zero, the entry at
        /// that index in a random permutation generated from the seed.
        /// </summary>
        /// <param name="env">Environment used to create the mapper and to read the key count.</param>
        /// <param name="ch">Channel used to check that the key cardinality is known when permuting.</param>
        /// <param name="input">Source data view.</param>
        /// <param name="col">Name of the column that is both read and overwritten.</param>
        /// <param name="type">Key type of the source column.</param>
        /// <param name="seed">Zero for the direct mapping; any other value seeds the permutation.</param>
        /// <returns>The data view with the column mapped to <see cref="Single"/>.</returns>
        private static IDataView AppendFloatMapper <TInput>(IHostEnvironment env, IChannel ch, IDataView input,
                                                            string col, KeyDataViewType type, int seed)
        {
            // Any key is convertible to ulong, so rather than add special case handling for all possible
            // key-types we just upfront convert it to the most general type (ulong) and work from there.
            KeyDataViewType ulongKeyType = new KeyDataViewType(typeof(ulong), type.Count);
            bool isIdentity;
            var toUInt64 = Conversions.Instance.GetStandardConversion <TInput, ulong>(type, ulongKeyType, out isIdentity);
            var isMissing = Conversions.Instance.GetIsNAPredicate <TInput>(type);

            ValueMapper <TInput, Single> mapper;

            if (seed != 0)
            {
                ch.Check(type.Count > 0, "Label must be of known cardinality.");
                int[] shuffled = Utils.GetRandomPermutation(RandomUtils.Create(seed), type.GetCountAsInt32(env));
                mapper =
                    (in TInput src, ref Single dst) =>
                {
                    if (isMissing(in src))
                    {
                        dst = Single.NaN;
                    }
                    else
                    {
                        // Attention: this delegate is called from multiple threads.
                        // The scratch value must stay local to the call; hoisting it out would
                        // share it between threads and cause a race condition.
                        ulong raw = 0;
                        toUInt64(in src, ref raw);
                        dst = (Single)shuffled[(int)(raw - 1)];
                    }
                };
            }
            else
            {
                mapper =
                    (in TInput src, ref Single dst) =>
                {
                    if (isMissing(in src))
                    {
                        dst = Single.NaN;
                    }
                    else
                    {
                        // Attention: this delegate is called from multiple threads.
                        // The scratch value must stay local to the call; hoisting it out would
                        // share it between threads and cause a race condition.
                        ulong raw = 0;
                        toUInt64(in src, ref raw);
                        dst = (Single)raw - 1;
                    }
                };
            }

            return LambdaColumnMapper.Create(env, "Key to Float Mapper", input, col, col, type, NumberDataViewType.Single, mapper);
        }
        /// <summary>
        /// Replaces the key-typed column <paramref name="col"/> of <paramref name="input"/> with a
        /// <see cref="Single"/> column ("Key to Float Mapper"). Missing keys become NaN. Other keys become
        /// the converted key value minus one, or, when <paramref name="seed"/> is non-zero, the entry at
        /// that index in a random permutation generated from the seed.
        /// </summary>
        /// <param name="env">Environment used to create the mapper.</param>
        /// <param name="ch">Channel used to check that the key cardinality is known when permuting.</param>
        /// <param name="input">Source data view.</param>
        /// <param name="col">Name of the column that is both read and overwritten.</param>
        /// <param name="type">Key type of the source column.</param>
        /// <param name="seed">Zero for the direct mapping; any other value seeds the permutation.</param>
        /// <returns>The data view with the column mapped to <see cref="Single"/>.</returns>
        private static IDataView AppendFloatMapper <TInput>(IHostEnvironment env, IChannel ch, IDataView input,
                                                            string col, KeyType type, int seed)
        {
            // Any key is convertible to ulong, so rather than add special case handling for all possible
            // key-types we just upfront convert it to the most general type (ulong) and work from there.
            KeyType dstType = new KeyType(DataKind.U8, type.Min, type.Count, type.Contiguous);
            bool    identity;
            var     converter = Conversions.Instance.GetStandardConversion <TInput, ulong>(type, dstType, out identity);
            var     isNa      = Conversions.Instance.GetIsNAPredicate <TInput>(type);

            ValueMapper <TInput, Single> mapper;

            if (seed == 0)
            {
                mapper =
                    (in TInput src, ref Single dst) =>
                {
                    // Attention: this delegate may be called from multiple threads.
                    // The scratch variable must be declared inside the lambda; a captured outer
                    // variable would be shared between threads and result in a race condition.
                    ulong temp = 0;
                    if (isNa(in src))
                    {
                        dst = Single.NaN;
                        return;
                    }
                    converter(in src, ref temp);
                    dst = (Single)(temp - 1);
                };
            }
            else
            {
                ch.Check(type.Count > 0, "Label must be of known cardinality.");
                int[] permutation = Utils.GetRandomPermutation(RandomUtils.Create(seed), type.Count);
                mapper =
                    (in TInput src, ref Single dst) =>
                {
                    // Attention: this delegate may be called from multiple threads.
                    // The scratch variable must be declared inside the lambda; a captured outer
                    // variable would be shared between threads and result in a race condition.
                    ulong temp = 0;
                    if (isNa(in src))
                    {
                        dst = Single.NaN;
                        return;
                    }
                    converter(in src, ref temp);
                    dst = (Single)permutation[(int)(temp - 1)];
                };
            }

            return(LambdaColumnMapper.Create(env, "Key to Float Mapper", input, col, col, type, NumberType.Float, mapper));
        }
Пример #6
0
        /// <summary>
        /// Wraps the label data with a "Label mapper" that overwrites the label column with a boolean
        /// column: true where <paramref name="equalsTarget"/> holds for the label value, false otherwise.
        /// </summary>
        /// <remarks>
        /// When <c>Args.ImputeMissingLabelsAsNegative</c> is false, the mapper is applied on top of an
        /// <c>NAFilter</c> over the label column (presumably dropping rows whose label is missing; confirm
        /// against <c>NAFilter</c>) instead of directly on the raw data.
        /// </remarks>
        /// <param name="type">Type of the label column; its raw type is asserted to be <typeparamref name="T"/>.</param>
        /// <param name="equalsTarget">Predicate that is true for label values belonging to the target class.</param>
        /// <param name="data">Data whose schema supplies the label column.</param>
        /// <returns>The mapped data view.</returns>
        private protected IDataView MapLabelsCore<T>(DataViewType type, InPredicate<T> equalsTarget, RoleMappedData data)
        {
            Host.AssertValue(type);
            Host.Assert(type.RawType == typeof(T));
            Host.AssertValue(equalsTarget);
            Host.AssertValue(data);
            Host.Assert(data.Schema.Label.HasValue);

            var label = data.Schema.Label.Value;
            IDataView dataView = data.Data;
            if (!Args.ImputeMissingLabelsAsNegative)
                dataView = new NAFilter(Host, data.Data, false, label.Name);

            // Map over dataView (not data.Data) so that the NA filter built above actually takes effect.
            return LambdaColumnMapper.Create(Host, "Label mapper", dataView,
                label.Name, label.Name, type, BooleanDataViewType.Instance,
                (in T src, ref bool dst) =>
                    dst = equalsTarget(in src));
        }
        private IDataView MapLabels(RoleMappedData data, int cls, out string dstName, IChannel ch)
        {
            var lab = data.Schema.Label.Value;

            Host.Assert(!data.Schema.Schema[lab.Index].IsHidden);
            Host.Assert(lab.Type.GetKeyCount() > 0 || lab.Type == NumberDataViewType.Single || lab.Type == NumberDataViewType.Double);

            // Get the destination label column name.
            dstName = data.Schema.Schema.GetTempColumnName();

            // Key values are 1-based.
            if (lab.Type.GetKeyCount() > 0)
            {
                uint key = (uint)(cls + 1);
                if (_args.downsampling > 0)
                {
                    return(CreateTrainingView(data, key, 1f, -1f, 0f, NumberDataViewType.UInt32, NumberDataViewType.Single, ch));
                }
                else
                {
                    return(LambdaColumnMapper.Create(Host, "LabelColumnMapper in oOVA (1)", FilterNA(data.Data, lab.Name),
                                                     lab.Name, dstName, NumberDataViewType.UInt32, NumberDataViewType.Single,
                                                     (in uint src, ref float dst) => { dst = src == key ? 1 : default(float); }));
        /// <summary>
        /// Wraps the data view of <paramref name="data"/> with a "Label mapper" that overwrites the label
        /// column with a float column: 1 where <paramref name="equalsTarget"/> holds, and the default
        /// float value otherwise.
        /// </summary>
        /// <remarks>
        /// When <c>Args.ImputeMissingLabelsAsNegative</c> is false and an is-NA predicate exists for
        /// <paramref name="type"/>, labels that are not the target but are missing map to NaN instead.
        /// </remarks>
        /// <param name="type">Type of the label column; its raw type is asserted to be <typeparamref name="T"/>.</param>
        /// <param name="equalsTarget">Predicate that is true for label values belonging to the target class.</param>
        /// <param name="data">Data whose schema supplies the label column.</param>
        /// <returns>The mapped data view.</returns>
        private protected IDataView MapLabelsCore<T>(ColumnType type, InPredicate<T> equalsTarget, RoleMappedData data)
        {
            Host.AssertValue(type);
            Host.Assert(type.RawType == typeof(T));
            Host.AssertValue(equalsTarget);
            Host.AssertValue(data);
            Host.Assert(data.Schema.Label.HasValue);

            var labelColumn = data.Schema.Label.Value;

            InPredicate<T> isNa;

            // Guard clause: missing labels are treated as negatives, either by request or because
            // the label type has no is-NA predicate.
            if (Args.ImputeMissingLabelsAsNegative || !Conversions.Instance.TryGetIsNAPredicate(type, out isNa))
            {
                return LambdaColumnMapper.Create(Host, "Label mapper", data.Data,
                    labelColumn.Name, labelColumn.Name, type, NumberType.Float,
                    (in T src, ref float dst) =>
                    {
                        if (equalsTarget(in src))
                            dst = 1;
                        else
                            dst = default(float);
                    });
            }

            // Missing labels must stay distinguishable: emit NaN for them.
            return LambdaColumnMapper.Create(Host, "Label mapper", data.Data,
                labelColumn.Name, labelColumn.Name, type, NumberType.Float,
                (in T src, ref float dst) =>
                {
                    if (equalsTarget(in src))
                        dst = 1;
                    else if (isNa(in src))
                        dst = float.NaN;
                    else
                        dst = default(float);
                });
        }
        /// <summary>
        /// Create the internal transform (not serialized in the zip file).
        /// </summary>
        private IDataTransform CreateTemplatedTransform()
        {
            IDataView view   = Source;
            var       schema = _input.Schema;
            int       index;

            for (int i = 0; i < _args.columns.Length; ++i)
            {
                index = SchemaHelper.GetColumnIndex(schema, _args.columns[i].Source);
                var typeCol = schema[index].Type;
                if (typeCol.IsVector())
                {
                    throw _host.Except("Expected a number as input.");
                }

                switch (typeCol.RawKind())
                {
                case DataKind.Single:
                    view = new PassThroughTransform(_host, new PassThroughTransform.Arguments(),
                                                    LambdaColumnMapper.Create(_host, "R42R4", view,
                                                                              _args.columns[i].Source, _args.columns[i].Name,
                                                                              NumberDataViewType.Single, NumberDataViewType.Single,
                                                                              (in float src, ref float dst) => { dst = src; }));
                    break;
Пример #10
0
        IDataTransform AppendToPipeline(IDataView input)
        {
            IDataView current = input;

            if (_shuffleInput)
            {
                var args1 = new RowShufflingTransformer.Arguments()
                {
                    ForceShuffle     = false,
                    ForceShuffleSeed = _seedShuffle,
                    PoolRows         = _poolRows,
                    PoolOnly         = false,
                };
                current = new RowShufflingTransformer(Host, args1, current);
            }

            // We generate a random number.
            var columnName = current.Schema.GetTempColumnName();
            var args2      = new GenerateNumberTransform.Arguments()
            {
                Column = new GenerateNumberTransform.Column[] { new GenerateNumberTransform.Column()
                                                                {
                                                                    Name = columnName
                                                                } },
                Seed = _seed ?? 42
            };
            IDataTransform currentTr = new GenerateNumberTransform(Host, args2, current);

            // We convert this random number into a part.
            var cRatios = new float[_ratios.Length];

            cRatios[0] = 0;
            for (int i = 1; i < _ratios.Length; ++i)
            {
                cRatios[i] = cRatios[i - 1] + _ratios[i - 1];
            }

            ValueMapper <float, int> mapper = (in float src, ref int dst) =>
            {
                for (int i = cRatios.Length - 1; i > 0; --i)
                {
                    if (src >= cRatios[i])
                    {
                        dst = i;
                        return;
                    }
                }
                dst = 0;
            };

            // Get location of columnName

            int index;

            currentTr.Schema.TryGetColumnIndex(columnName, out index);
            var ct   = currentTr.Schema.GetColumnType(index);
            var view = LambdaColumnMapper.Create(Host, "Key to part mapper", currentTr,
                                                 columnName, _newColumn, ct, NumberType.I4, mapper);

            // We cache the result to avoid the pipeline to change the random number.
            var args3 = new ExtendedCacheTransform.Arguments()
            {
                inDataFrame = string.IsNullOrEmpty(_cacheFile),
                numTheads   = _numThreads,
                cacheFile   = _cacheFile,
                reuse       = _reuse,
            };

            currentTr = new ExtendedCacheTransform(Host, args3, view);

            // Removing the temporary column.
            var finalTr     = ColumnSelectingTransformer.CreateDrop(Host, currentTr, new string[] { columnName });
            var taggedViews = new List <Tuple <string, ITaggedDataView> >();

            // filenames
            if (_filenames != null || _tags != null)
            {
                int nbf = _filenames == null ? 0 : _filenames.Length;
                if (nbf > 0 && nbf != _ratios.Length)
                {
                    throw Host.Except("Differen number of filenames and ratios.");
                }
                int nbt = _tags == null ? 0 : _tags.Length;
                if (nbt > 0 && nbt != _ratios.Length)
                {
                    throw Host.Except("Differen number of filenames and ratios.");
                }
                int nb = Math.Max(nbf, nbt);

                using (var ch = Host.Start("Split the datasets and stores each part."))
                {
                    for (int i = 0; i < nb; ++i)
                    {
                        if (_filenames == null || !_filenames.Any())
                        {
                            ch.Info("Create part {0}: {1} (tag: {2})", i + 1, _ratios[i], _tags[i]);
                        }
                        else
                        {
                            ch.Info("Create part {0}: {1} (file: {2})", i + 1, _ratios[i], _filenames[i]);
                        }
                        var ar1 = new RangeFilter.Arguments()
                        {
                            Column = _newColumn, Min = i, Max = i, IncludeMax = true
                        };
                        int pardId   = i;
                        var filtView = LambdaFilter.Create <int>(Host, string.Format("Select part {0}", i), currentTr,
                                                                 _newColumn, NumberType.I4,
                                                                 (in int part) => { return(part.Equals(pardId)); });