/// <summary>
        /// This is for the indicator case - vector input and outputs should be concatenated.
        /// Each source slot contributes <c>_bitsPerKey[iinfo]</c> consecutive output slots, so the
        /// output length is the source length multiplied by the bits-per-key of the column.
        /// </summary>
        /// <param name="input">Row the source key vector is read from.</param>
        /// <param name="iinfo">Index of the column in <c>Infos</c>.</param>
        /// <returns>A getter that fills the destination buffer with the concatenated encodings.</returns>
        private ValueGetter <VBuffer <float> > MakeGetterInd(IRow input, int iinfo)
        {
            Host.AssertValue(input);
            Host.Assert(Infos[iinfo].TypeSrc.IsVector);
            Host.Assert(Infos[iinfo].TypeSrc.ItemType.IsKey);

            int cv = Infos[iinfo].TypeSrc.VectorSize;
            Host.Assert(cv >= 0);

            var getSrc     = RowCursorUtils.GetVecGetterAs <uint>(NumberType.U4, input, Infos[iinfo].Source);
            var src        = default(VBuffer <uint>);
            var bldr       = new BufferBuilder <float>(R4Adder.Instance);
            int bitsPerKey = _bitsPerKey[iinfo];

            return
                ((ref VBuffer <float> dst) =>
            {
                getSrc(ref src);

                // A declared size of 0 skips the length check (presumably a variable-length vector).
                Host.Check(src.Length == cv || cv == 0);

                // The product is evaluated under 'checked' so that an oversized input raises an
                // OverflowException instead of silently wrapping into a bogus (possibly negative)
                // builder length.
                int lenDst = checked (src.Length * bitsPerKey);
                bldr.Reset(lenDst, false);

                // Slot i of the source is encoded starting at output offset i * bitsPerKey.
                // The offset cannot overflow because it never exceeds lenDst.
                int offset = 0;
                foreach (uint value in src.DenseValues())
                {
                    EncodeValueToBinary(bldr, value, bitsPerKey, offset);
                    offset += bitsPerKey;
                }

                bldr.GetResult(ref dst);

                Contracts.Assert(dst.Length == lenDst);
            });
        }
 /// <summary>
 /// Builds the cursor on top of the row-cursor shim obtained from
 /// <c>TransposerUtils.GetRowCursorShim</c> for <paramref name="cursor"/>, and obtains a vector
 /// getter over the shim's single column using the item type of <paramref name="typeDst"/>.
 /// </summary>
 /// <param name="provider">Channel provider handed to the base class and to the shim factory.</param>
 /// <param name="cursor">The slot cursor being wrapped.</param>
 /// <param name="typeDst">Destination vector type; stored as the cursor's type.</param>
 public SlotCursor(IChannelProvider provider, ISlotCursor cursor, VectorType typeDst)
     : base(provider, TransposerUtils.GetRowCursorShim(provider, cursor))
 {
     // The shim is expected to expose exactly one column, typed as the cursor's slot type.
     Ch.Assert(Input.Schema.ColumnCount == 1);
     Ch.Assert(Input.Schema.GetColumnType(0) == cursor.GetSlotType());
     Ch.AssertValue(typeDst);

     _type   = typeDst;
     _getter = RowCursorUtils.GetVecGetterAs(typeDst.ItemType, Input, 0);
 }
        /// <summary>
        /// Returns the getter for column <paramref name="iinfo"/>, obtained from
        /// <c>RowCursorUtils</c> for the column's destination type: the vector overload when the
        /// destination type is a vector, the scalar overload otherwise.
        /// </summary>
        /// <param name="ch">Optional channel; may be null.</param>
        /// <param name="input">Row to read the source column from.</param>
        /// <param name="iinfo">Index of the column in <c>Infos</c>.</param>
        /// <param name="disposer">Always set to null; there is nothing to dispose.</param>
        protected override Delegate GetGetterCore(IChannel ch, IRow input, int iinfo, out Action disposer)
        {
            Host.AssertValueOrNull(ch);
            Host.AssertValue(input);
            Host.Assert(0 <= iinfo && iinfo < Infos.Length);

            disposer = null;

            var dstType = _exes[iinfo].TypeDst;

            if (dstType.IsVector)
            {
                // Vector destination: the getter is requested in terms of the item type.
                return RowCursorUtils.GetVecGetterAs(dstType.AsVector.ItemType, input, Infos[iinfo].Source);
            }

            return RowCursorUtils.GetGetterAs(dstType, input, Infos[iinfo].Source);
        }
            /// <summary>
            /// Binds the label and score getters (and the weight getter, when the schema has a
            /// weight column) to <paramref name="row"/> for the coming pass.
            /// </summary>
            /// <param name="row">Row the getters are created on.</param>
            /// <param name="schema">Role-mapped schema supplying the label, score and weight columns.</param>
            public override void InitializeNextPass(IRow row, RoleMappedSchema schema)
            {
                Contracts.Assert(PassNum < 1);
                Contracts.AssertValue(schema.Label);

                var scoreColumn = schema.GetUniqueColumn(MetadataUtils.Const.ScoreValueKind.Score);

                // The label is requested as a Float vector; the score is read with its own getter.
                _labelGetter = RowCursorUtils.GetVecGetterAs <Float>(NumberType.Float, row, schema.Label.Index);
                _scoreGetter = row.GetGetter <VBuffer <Float> >(scoreColumn.Index);
                Contracts.AssertValue(_labelGetter);
                Contracts.AssertValue(_scoreGetter);

                // The weight column is optional.
                if (schema.Weight == null)
                {
                    return;
                }

                _weightGetter = row.GetGetter <Float>(schema.Weight.Index);
            }
        /// <summary>
        /// Returns a getter that reads the slots of <paramref name="cursor"/> as label values of
        /// type <see cref="Single"/>. R4 slots are read directly, R8 and boolean slots go through
        /// <c>GetVecGetterAs</c>, and key-typed slots are mapped to <c>key - 1</c>, with
        /// out-of-range keys (including 0) mapped to NaN.
        /// </summary>
        /// <param name="cursor">Slot cursor whose slot item type determines the conversion.</param>
        /// <returns>A getter producing a vector of <see cref="Single"/> label values.</returns>
        public static ValueGetter <VBuffer <Single> > GetLabelGetter(ISlotCursor cursor)
        {
            var itemType = cursor.GetSlotType().ItemType;

            if (itemType == NumberType.R4)
            {
                return cursor.GetGetter <Single>();
            }

            if (itemType == NumberType.R8 || itemType.IsBool)
            {
                return GetVecGetterAs <Single>(NumberType.R4, cursor);
            }

            Contracts.Check(itemType.IsKey, "Only floating point number, boolean, and key type values can be used as label.");
            Contracts.Assert(TestGetLabelGetter(itemType) == null);

            // A key count of zero is treated as "no upper bound" on the key value.
            ulong maxKey = (ulong)itemType.KeyCount;
            if (maxKey == 0)
            {
                maxKey = ulong.MaxValue;
            }

            var             readKeys = RowCursorUtils.GetVecGetterAs <ulong>(NumberType.U8, cursor);
            VBuffer <ulong> keys     = default(VBuffer <ulong>);

            return
                ((ref VBuffer <Single> dst) =>
            {
                readKeys(ref keys);

                // Unfortunately defaults in one do not translate to defaults of the other,
                // so this will not be sparsity preserving. Assume a dense output.
                Single[] labels = dst.Values;
                Utils.EnsureSize(ref labels, keys.Length);

                foreach (var item in keys.Items(all: true))
                {
                    ulong key     = item.Value;
                    bool  inRange = 0 < key && key <= maxKey;
                    if (inRange)
                    {
                        labels[item.Key] = key - 1;
                    }
                    else
                    {
                        labels[item.Key] = Single.NaN;
                    }
                }

                dst = new VBuffer <Single>(keys.Length, labels, dst.Indices);
            });
        }
        /// <summary>
        /// This is for the bagging case - vector input and outputs should be added.
        /// The output has one slot per key value; every in-range key found in the source
        /// contributes 1 to the slot at position <c>key - 1</c>.
        /// </summary>
        /// <param name="input">Row the source key vector is read from.</param>
        /// <param name="iinfo">Index of the column in <c>Infos</c>.</param>
        /// <returns>A getter that fills the destination buffer with the bagged result.</returns>
        private ValueGetter <VBuffer <Float> > MakeGetterBag(IRow input, int iinfo)
        {
            Host.AssertValue(input);
            Host.Assert(Infos[iinfo].TypeSrc.IsVector);
            Host.Assert(Infos[iinfo].TypeSrc.ItemType.IsKey);
            Host.Assert(_bag[iinfo]);
            Host.Assert(Infos[iinfo].TypeSrc.ItemType.KeyCount == _types[iinfo].VectorSize);

            var colInfo  = Infos[iinfo];
            int keyRange = colInfo.TypeSrc.ItemType.KeyCount;
            Host.Assert(keyRange > 0);

            int declaredLength = colInfo.TypeSrc.VectorSize;
            Host.Assert(declaredLength >= 0);

            var readKeys = RowCursorUtils.GetVecGetterAs <uint>(NumberType.U4, input, colInfo.Source);
            var keys     = default(VBuffer <uint>);
            var builder  = BufferBuilder <float> .CreateDefault();

            return
                ((ref VBuffer <Float> dst) =>
            {
                builder.Reset(keyRange, false);

                readKeys(ref keys);
                Host.Check(declaredLength == 0 || keys.Length == declaredLength);

                // The indices are irrelevant in the bagging case, so only the explicitly
                // stored values are visited.
                var stored = keys.Values;
                int storedCount = keys.Count;
                for (int i = 0; i < storedCount; i++)
                {
                    // Keys are one-based; subtracting 1 from a raw 0 yields a value that fails
                    // the range test below (assuming the default unchecked arithmetic context).
                    uint zeroBased = stored[i] - 1;
                    if (zeroBased >= keyRange)
                    {
                        continue;
                    }

                    builder.AddFeature((int)zeroBased, 1);
                }

                builder.GetResult(ref dst);
            });
        }
Exemple #7
0
        /// <summary>
        /// Creates the getter for output column <paramref name="iinfo"/>. The getter reads every
        /// source key vector of the column in turn, feeds each one to a single
        /// <c>NgramBufferBuilder</c>, and returns the builder's result.
        /// </summary>
        /// <param name="ch">Channel used for the precondition assert.</param>
        /// <param name="input">Row the source columns are read from.</param>
        /// <param name="iinfo">Index of the output column in <c>_bindings.Infos</c>.</param>
        /// <param name="decorator">Optional wrapper applied to the n-gram id finder before it is used.</param>
        /// <returns>A <c>ValueGetter</c> over a <c>Float</c> vector, returned as a <see cref="Delegate"/>.</returns>
        private Delegate MakeGetter(IChannel ch, IRow input, int iinfo, FinderDecorator decorator = null)
        {
            ch.Assert(_bindings.Infos[iinfo].SrcTypes.All(t => t.IsVector && t.ItemType.IsKey));

            var colInfo     = _bindings.Infos[iinfo];
            int sourceCount = colInfo.SrcIndices.Length;

            // One getter per source column, all reading keys as U4.
            var sourceGetters = new ValueGetter <VBuffer <uint> > [sourceCount];
            for (int k = 0; k < sourceCount; k++)
            {
                sourceGetters[k] = RowCursorUtils.GetVecGetterAs <uint>(NumberType.U4, input, colInfo.SrcIndices[k]);
            }

            // A single scratch buffer is reused for every source.
            var scratch = default(VBuffer <uint>);

            var finder = GetNgramIdFinder(iinfo);
            if (decorator != null)
            {
                finder = decorator(iinfo, finder);
            }

            var builder = new NgramBufferBuilder(_exes[iinfo].NgramLength, _exes[iinfo].SkipLength,
                                                 _bindings.Types[iinfo].ValueCount, finder);

            // A key count of zero (or less) is replaced by uint.MaxValue.
            var keyLimits = _bindings.Infos[iinfo].SrcTypes
                            .Select(t => t.ItemType.KeyCount > 0 ? (uint)t.ItemType.KeyCount : uint.MaxValue)
                            .ToArray();

            // REVIEW: Special casing the srcCount==1 case could potentially improve perf.
            ValueGetter <VBuffer <Float> > getter =
                (ref VBuffer <Float> dst) =>
            {
                builder.Reset();
                for (int k = 0; k < sourceCount; k++)
                {
                    sourceGetters[k](ref scratch);
                    builder.AddNgrams(ref scratch, k, keyLimits[k]);
                }
                builder.GetResult(ref dst);
            };

            return getter;
        }
        /// <summary>
        /// Creates the getters for the five output columns (label, score, L1 distance, squared L2
        /// distance, and distance). Entries for inactive columns are left null. All active getters
        /// share one cached copy of the label and score vectors, refreshed when the position of
        /// <paramref name="input"/> changes.
        /// </summary>
        /// <param name="input">Row the label and score are read from.</param>
        /// <param name="activeCols">Predicate telling which output columns are requested.</param>
        /// <param name="disposer">Always set to null.</param>
        /// <returns>An array of five delegates indexed by output column.</returns>
        public override Delegate[] CreateGetters(IRow input, Func <int, bool> activeCols, out Action disposer)
        {
            Host.Assert(LabelIndex >= 0);
            Host.Assert(ScoreIndex >= 0);

            disposer = null;

            // Cache shared by all getters; -1 means nothing has been read yet.
            long cachedAt    = -1;
            var  cachedLabel = default(VBuffer <Float>);
            var  cachedScore = default(VBuffer <Float>);

            // Stand-in used when a source vector is not needed by any active column.
            ValueGetter <VBuffer <Float> > resetToDefault = (ref VBuffer <Float> vec) => vec = default(VBuffer <Float>);

            ValueGetter <VBuffer <Float> > readLabel;
            if (activeCols(LabelOutput) || activeCols(L1Output) || activeCols(L2Output) || activeCols(DistCol))
            {
                readLabel = RowCursorUtils.GetVecGetterAs <Float>(NumberType.Float, input, LabelIndex);
            }
            else
            {
                readLabel = resetToDefault;
            }

            ValueGetter <VBuffer <Float> > readScore;
            if (activeCols(ScoreOutput) || activeCols(L1Output) || activeCols(L2Output) || activeCols(DistCol))
            {
                readScore = input.GetGetter <VBuffer <Float> >(ScoreIndex);
            }
            else
            {
                readScore = resetToDefault;
            }

            Action refresh =
                () =>
            {
                if (cachedAt == input.Position)
                {
                    return;
                }

                readLabel(ref cachedLabel);
                readScore(ref cachedScore);
                cachedAt = input.Position;
            };

            var result = new Delegate[5];

            if (activeCols(LabelOutput))
            {
                result[LabelOutput] = (ValueGetter <VBuffer <Float> >)((ref VBuffer <Float> dst) =>
                {
                    refresh();
                    cachedLabel.CopyTo(ref dst);
                });
            }

            if (activeCols(ScoreOutput))
            {
                result[ScoreOutput] = (ValueGetter <VBuffer <Float> >)((ref VBuffer <Float> dst) =>
                {
                    refresh();
                    cachedScore.CopyTo(ref dst);
                });
            }

            if (activeCols(L1Output))
            {
                result[L1Output] = (ValueGetter <double>)((ref double dst) =>
                {
                    refresh();
                    dst = VectorUtils.L1Distance(ref cachedLabel, ref cachedScore);
                });
            }

            if (activeCols(L2Output))
            {
                result[L2Output] = (ValueGetter <double>)((ref double dst) =>
                {
                    refresh();
                    dst = VectorUtils.L2DistSquared(ref cachedLabel, ref cachedScore);
                });
            }

            if (activeCols(DistCol))
            {
                result[DistCol] = (ValueGetter <double>)((ref double dst) =>
                {
                    refresh();
                    dst = MathUtils.Sqrt(VectorUtils.L2DistSquared(ref cachedLabel, ref cachedScore));
                });
            }

            return result;
        }
        /// <summary>
        /// This is for the indicator (non-bagging) case - vector input and outputs should be concatenated.
        /// Source slot <c>s</c> holding the one-based key <c>k</c> produces a single 1 at output
        /// index <c>s * size + (k - 1)</c>, where <c>size</c> is the key count of the source item type.
        /// </summary>
        /// <param name="input">Row the source key vector is read from.</param>
        /// <param name="iinfo">Index of the column in <c>Infos</c>.</param>
        /// <returns>A getter producing the concatenated indicator vector.</returns>
        private ValueGetter <VBuffer <Float> > MakeGetterInd(IRow input, int iinfo)
        {
            Host.AssertValue(input);
            Host.Assert(Infos[iinfo].TypeSrc.IsVector);
            Host.Assert(Infos[iinfo].TypeSrc.ItemType.IsKey);
            Host.Assert(!_bag[iinfo]);

            var colInfo  = Infos[iinfo];
            int keyRange = colInfo.TypeSrc.ItemType.KeyCount;
            Host.Assert(keyRange > 0);

            int declaredLength = colInfo.TypeSrc.VectorSize;
            Host.Assert(declaredLength >= 0);
            Host.Assert(_types[iinfo].VectorSize == keyRange * declaredLength);

            var readKeys = RowCursorUtils.GetVecGetterAs <uint>(NumberType.U4, input, colInfo.Source);
            var keys     = default(VBuffer <uint>);

            return
                ((ref VBuffer <Float> dst) =>
            {
                readKeys(ref keys);
                int srcLength = keys.Length;
                Host.Check(srcLength == declaredLength || declaredLength == 0);

                // Since we generate values in order, no need for a builder.
                var outValues = dst.Values;
                var outIndices = dst.Indices;

                int dstLength = checked (keyRange * srcLength);
                int stored = keys.Count;

                // At most one output entry per stored source entry, so 'stored' slots suffice.
                if (Utils.Size(outValues) < stored)
                {
                    outValues = new Float[stored];
                }
                if (Utils.Size(outIndices) < stored)
                {
                    outIndices = new int[stored];
                }

                var srcValues = keys.Values;
                bool dense = keys.IsDense;
                int[] srcIndices = null;
                if (dense)
                {
                    Host.Assert(srcLength == stored);
                }
                else
                {
                    srcIndices = keys.Indices;
                }

                int written = 0;
                for (int i = 0; i < stored; i++)
                {
                    Host.Assert(written < stored);

                    // One-based key to zero-based; anything outside [0, keyRange) is skipped.
                    uint zeroBased = srcValues[i] - 1;
                    if (zeroBased < (uint)keyRange)
                    {
                        outValues[written] = 1;

                        // Dense input: the slot is the loop position. Sparse input: it is the stored index.
                        outIndices[written] = (dense ? i : srcIndices[i]) * keyRange + (int)zeroBased;
                        written++;
                    }
                }

                dst = new VBuffer <Float>(dstLength, written, outValues, outIndices);
            });
        }
        /// <summary>
        /// Creates the getter for column <paramref name="iinfo"/>. The getter reads the source key
        /// vector, lets an <c>NgramBufferBuilder</c> produce the n-gram vector, and then
        /// post-processes it according to the column's weighting criteria (Tf: untouched,
        /// TfIdf: multiplied by the inverse document frequency, Idf: replaced by the inverse
        /// document frequency where the value is at least 1 and by 0 elsewhere).
        /// When the builder reports <c>IsEmpty</c>, the output is a zero-length vector.
        /// </summary>
        /// <param name="ch">Optional channel; may be null.</param>
        /// <param name="input">Row the source column is read from.</param>
        /// <param name="iinfo">Index of the column in <c>Infos</c>.</param>
        /// <param name="disposer">Always set to null.</param>
        /// <returns>A <c>ValueGetter</c> over a <c>Float</c> vector, returned as a <see cref="Delegate"/>.</returns>
        protected override Delegate GetGetterCore(IChannel ch, IRow input, int iinfo, out Action disposer)
        {
            Host.AssertValueOrNull(ch);
            Host.AssertValue(input);
            Host.Assert(0 <= iinfo && iinfo < Infos.Length);
            Host.Assert(Infos[iinfo].TypeSrc.IsVector);
            Host.Assert(Infos[iinfo].TypeSrc.ItemType.IsKey);

            disposer = null;

            var readKeys = RowCursorUtils.GetVecGetterAs <uint>(NumberType.U4, input, Infos[iinfo].Source);
            var keys     = default(VBuffer <uint>);
            var builder  = new NgramBufferBuilder(_exes[iinfo].NgramLength, _exes[iinfo].SkipLength,
                                                  _ngramMaps[iinfo].Count, GetNgramIdFinder(iinfo));

            // A key count of zero is replaced by uint.MaxValue.
            var keyLimit = (uint)Infos[iinfo].TypeSrc.ItemType.KeyCount;
            if (keyLimit == 0)
            {
                keyLimit = uint.MaxValue;
            }

            // Shared first half of every getter: read the source and, unless the builder is
            // empty, feed it the keys. Returns false when the builder is empty.
            Func <bool> loadNgrams =
                () =>
            {
                readKeys(ref keys);
                if (builder.IsEmpty)
                {
                    return false;
                }

                builder.Reset();
                builder.AddNgrams(ref keys, 0, keyLimit);
                return true;
            };

            ValueGetter <VBuffer <Float> > getter;

            switch (_exes[iinfo].Weighting)
            {
            case WeightingCriteria.Tf:
                getter =
                    (ref VBuffer <Float> dst) =>
                {
                    if (!loadNgrams())
                    {
                        dst = new VBuffer <Float>(0, dst.Values, dst.Indices);
                        return;
                    }

                    builder.GetResult(ref dst);
                };
                break;

            case WeightingCriteria.TfIdf:
                Host.AssertValue(_invDocFreqs[iinfo]);
                getter =
                    (ref VBuffer <Float> dst) =>
                {
                    if (!loadNgrams())
                    {
                        dst = new VBuffer <Float>(0, dst.Values, dst.Indices);
                        return;
                    }

                    builder.GetResult(ref dst);
                    VBufferUtils.Apply(ref dst, (int i, ref Float v) => v = (Float)(v * _invDocFreqs[iinfo][i]));
                };
                break;

            case WeightingCriteria.Idf:
                Host.AssertValue(_invDocFreqs[iinfo]);
                getter =
                    (ref VBuffer <Float> dst) =>
                {
                    if (!loadNgrams())
                    {
                        dst = new VBuffer <Float>(0, dst.Values, dst.Indices);
                        return;
                    }

                    builder.GetResult(ref dst);
                    VBufferUtils.Apply(ref dst, (int i, ref Float v) => v = v >= 1 ? (Float)_invDocFreqs[iinfo][i] : 0);
                };
                break;

            default:
                throw Host.Except("Unsupported weighting criteria");
            }

            return getter;
        }
        /// <summary>
        /// Makes one pass over <paramref name="trainingData"/> and builds, for every column in
        /// <c>Infos</c>, a <c>SequencePool</c> of n-grams. For columns whose settings report
        /// <c>RequireIdf()</c>, it also computes per-n-gram inverse document frequencies.
        /// As a side effect it fills <c>_exes[iinfo].NonEmptyLevels</c>.
        /// </summary>
        /// <param name="args">Transform arguments; supplies AllLengths and MaxNumTerms, per column or global.</param>
        /// <param name="trainingData">Data the n-gram dictionaries are built from.</param>
        /// <param name="invDocFreqs">
        /// Per column, per n-gram slot: <c>log(totalDocs / documentCount)</c> for slots seen in at
        /// least one document. Entries stay null for columns that do not require IDF.
        /// </param>
        /// <returns>One <c>SequencePool</c> per column.</returns>
        private SequencePool[] Train(Arguments args, IDataView trainingData, out double[][] invDocFreqs)
        {
            // Contains the maximum number of grams to store in the dictionary, for each level of ngrams,
            // from 1 (in position 0) up to ngramLength (in position ngramLength-1)
            var lims = new int[Infos.Length][];

            for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
            {
                // Column-level settings take precedence over the global ones.
                var all         = args.Column[iinfo].AllLengths ?? args.AllLengths;
                var ngramLength = _exes[iinfo].NgramLength;
                var maxNumTerms = Utils.Size(args.Column[iinfo].MaxNumTerms) > 0 ? args.Column[iinfo].MaxNumTerms : args.MaxNumTerms;
                if (!all)
                {
                    // Only the longest n-gram level gets a non-zero limit; the other levels stay at 0.
                    Host.CheckUserArg(Utils.Size(maxNumTerms) == 0 ||
                                      Utils.Size(maxNumTerms) == 1 && maxNumTerms[0] > 0, nameof(args.MaxNumTerms));
                    lims[iinfo] = new int[ngramLength];
                    lims[iinfo][ngramLength - 1] = Utils.Size(maxNumTerms) == 0 ? Arguments.DefaultMaxTerms : maxNumTerms[0];
                }
                else
                {
                    // Levels beyond the supplied limits reuse the last supplied limit (or the default
                    // when none was supplied).
                    Host.CheckUserArg(Utils.Size(maxNumTerms) <= ngramLength, nameof(args.MaxNumTerms));
                    Host.CheckUserArg(Utils.Size(maxNumTerms) == 0 || maxNumTerms.All(i => i >= 0) && maxNumTerms[maxNumTerms.Length - 1] > 0, nameof(args.MaxNumTerms));
                    var extend = Utils.Size(maxNumTerms) == 0 ? Arguments.DefaultMaxTerms : maxNumTerms[maxNumTerms.Length - 1];
                    lims[iinfo] = Utils.BuildArray(ngramLength,
                                                   i => i < Utils.Size(maxNumTerms) ? maxNumTerms[i] : extend);
                }
            }

            var helpers = new NgramBufferBuilder[Infos.Length];
            var getters = new ValueGetter <VBuffer <uint> > [Infos.Length];
            var src     = new VBuffer <uint> [Infos.Length];

            // Keep track of how many grams are in the pool for each value of n. Position
            // i in _counts counts how many (i+1)-grams are in the pool for column iinfo.
            var counts    = new int[Infos.Length][];
            var ngramMaps = new SequencePool[Infos.Length];

            // Only the source columns of this transform are activated on the cursor.
            bool[] activeInput = new bool[trainingData.Schema.ColumnCount];
            foreach (var info in Infos)
            {
                activeInput[info.Source] = true;
            }
            using (var cursor = trainingData.GetRowCursor(col => activeInput[col]))
                using (var pch = Host.StartProgressChannel("Building n-gram dictionary"))
                {
                    // Per-column setup: getter, scratch buffer, counters, pool and builder.
                    for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
                    {
                        Host.Assert(Infos[iinfo].TypeSrc.IsVector && Infos[iinfo].TypeSrc.ItemType.IsKey);
                        var ngramLength = _exes[iinfo].NgramLength;
                        var skipLength  = _exes[iinfo].SkipLength;

                        getters[iinfo]   = RowCursorUtils.GetVecGetterAs <uint>(NumberType.U4, cursor, Infos[iinfo].Source);
                        src[iinfo]       = default(VBuffer <uint>);
                        counts[iinfo]    = new int[ngramLength];
                        ngramMaps[iinfo] = new SequencePool();

                        // Note: GetNgramIdFinderAdd will control how many ngrams of a specific length will
                        // be added (using lims[iinfo]), therefore we set slotLim to the maximum
                        helpers[iinfo] = new NgramBufferBuilder(ngramLength, skipLength, Utils.ArrayMaxSize,
                                                                GetNgramIdFinderAdd(counts[iinfo], lims[iinfo], ngramMaps[iinfo], _exes[iinfo].RequireIdf(), Host));
                    }

                    // NOTE(review): neither cInfoFull nor infoFull is written anywhere in this method,
                    // so as far as this code shows the loop below only stops when the cursor is
                    // exhausted - confirm whether an early exit was intended.
                    int    cInfoFull = 0;
                    bool[] infoFull  = new bool[Infos.Length];

                    invDocFreqs = new double[Infos.Length][];

                    long   totalDocs = 0;
                    Double rowCount  = trainingData.GetRowCount(true) ?? Double.NaN;
                    var    buffers   = new VBuffer <float> [Infos.Length];
                    pch.SetHeader(new ProgressHeader(new[] { "Total n-grams" }, new[] { "documents" }),
                                  e => e.SetProgress(0, totalDocs, rowCount));
                    while (cInfoFull < Infos.Length && cursor.MoveNext())
                    {
                        totalDocs++;
                        for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
                        {
                            getters[iinfo](ref src[iinfo]);
                            // A key count of zero is replaced by uint.MaxValue.
                            var keyCount = (uint)Infos[iinfo].TypeSrc.ItemType.KeyCount;
                            if (keyCount == 0)
                            {
                                keyCount = uint.MaxValue;
                            }
                            if (!infoFull[iinfo])
                            {
                                // With IDF the builder is reset per row so that its result describes
                                // this row only.
                                if (_exes[iinfo].RequireIdf())
                                {
                                    helpers[iinfo].Reset();
                                }

                                helpers[iinfo].AddNgrams(ref src[iinfo], 0, keyCount);
                                if (_exes[iinfo].RequireIdf())
                                {
                                    // Grow the document-count array to the current number of n-grams, then
                                    // count this row once for every slot whose value is at least 1.
                                    int totalNgrams = counts[iinfo].Sum();
                                    Utils.EnsureSize(ref invDocFreqs[iinfo], totalNgrams);
                                    helpers[iinfo].GetResult(ref buffers[iinfo]);
                                    foreach (var pair in buffers[iinfo].Items())
                                    {
                                        if (pair.Value >= 1)
                                        {
                                            invDocFreqs[iinfo][pair.Key] += 1;
                                        }
                                    }
                                }
                            }
                            AssertValid(counts[iinfo], lims[iinfo], ngramMaps[iinfo]);
                        }
                    }

                    pch.Checkpoint(counts.Sum(c => c.Sum()), totalDocs);
                    // Turn the document counts into inverse document frequencies; slots with a zero
                    // count are left at zero.
                    for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
                    {
                        for (int i = 0; i < Utils.Size(invDocFreqs[iinfo]); i++)
                        {
                            if (invDocFreqs[iinfo][i] != 0)
                            {
                                invDocFreqs[iinfo][i] = Math.Log(totalDocs / invDocFreqs[iinfo][i]);
                            }
                        }
                    }

                    // Record, per column, which n-gram lengths ended up with at least one entry.
                    for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
                    {
                        AssertValid(counts[iinfo], lims[iinfo], ngramMaps[iinfo]);

                        int ngramLength = _exes[iinfo].NgramLength;
                        for (int i = 0; i < ngramLength; i++)
                        {
                            _exes[iinfo].NonEmptyLevels[i] = counts[iinfo][i] > 0;
                        }
                    }

                    return(ngramMaps);
                }
        }