/// <summary>
/// Builds the getter for the indicator case: the input is a vector of keys, and the
/// binary encoding of each key is written into its own run of <c>bitsPerKey</c> output
/// slots, so the per-key encodings end up concatenated.
/// </summary>
/// <param name="input">Row from which the source key vector is read.</param>
/// <param name="iinfo">Index into <c>Infos</c> / <c>_bitsPerKey</c> for the column being produced.</param>
/// <returns>A getter whose output has <c>src.Length * bitsPerKey</c> slots.</returns>
private ValueGetter<VBuffer<float>> MakeGetterInd(IRow input, int iinfo)
{
    Host.AssertValue(input);
    var typeSrc = Infos[iinfo].TypeSrc;
    Host.Assert(typeSrc.IsVector);
    Host.Assert(typeSrc.ItemType.IsKey);

    // A declared size of zero is accepted by the length check in the getter below.
    int expectedLength = typeSrc.VectorSize;
    Host.Assert(expectedLength >= 0);

    var readKeys = RowCursorUtils.GetVecGetterAs<uint>(NumberType.U4, input, Infos[iinfo].Source);
    var keys = default(VBuffer<uint>);
    var builder = new BufferBuilder<float>(R4Adder.Instance);
    int bitsPerKey = _bitsPerKey[iinfo];

    return (ref VBuffer<float> dst) =>
    {
        readKeys(ref keys);
        Host.Check(expectedLength == 0 || keys.Length == expectedLength);

        int totalBits = keys.Length * bitsPerKey;
        builder.Reset(totalBits, false);

        // Walk the keys densely; each key is encoded starting at its own offset.
        int offset = 0;
        foreach (uint key in keys.DenseValues())
        {
            EncodeValueToBinary(builder, key, bitsPerKey, offset);
            offset += bitsPerKey;
        }

        builder.GetResult(ref dst);
        Contracts.Assert(dst.Length == totalBits);
    };
}
/// <summary>
/// Creates a cursor over the row-cursor shim of <paramref name="cursor"/>, reading the
/// shim's single column as a vector whose item type is that of <paramref name="typeDst"/>.
/// </summary>
/// <param name="provider">Channel provider handed to the base class and to the shim.</param>
/// <param name="cursor">Slot cursor being wrapped.</param>
/// <param name="typeDst">Destination vector type; stored as the cursor's type.</param>
public SlotCursor(IChannelProvider provider, ISlotCursor cursor, VectorType typeDst)
    : base(provider, TransposerUtils.GetRowCursorShim(provider, cursor))
{
    // The shim is expected to expose exactly one column, typed as the slot type.
    Ch.Assert(Input.Schema.ColumnCount == 1);
    Ch.Assert(Input.Schema.GetColumnType(0) == cursor.GetSlotType());
    Ch.AssertValue(typeDst);

    var itemTypeDst = typeDst.ItemType;
    _getter = RowCursorUtils.GetVecGetterAs(itemTypeDst, Input, 0);
    _type = typeDst;
}
/// <summary>
/// Creates the getter for output column <paramref name="iinfo"/> by asking
/// <c>RowCursorUtils</c> for a getter over the source column that yields the
/// destination type recorded in <c>_exes[iinfo].TypeDst</c>.
/// </summary>
/// <param name="ch">Optional channel; may be null.</param>
/// <param name="input">Row to read the source column from.</param>
/// <param name="iinfo">Index of the column info, in <c>[0, Infos.Length)</c>.</param>
/// <param name="disposer">Always set to null; this getter holds nothing to dispose.</param>
/// <returns>
/// A scalar getter when the destination type is not a vector, otherwise a vector getter
/// over the destination item type.
/// </returns>
protected override Delegate GetGetterCore(IChannel ch, IRow input, int iinfo, out Action disposer)
{
    Host.AssertValueOrNull(ch);
    Host.AssertValue(input);
    Host.Assert(0 <= iinfo && iinfo < Infos.Length);
    disposer = null;

    var typeDst = _exes[iinfo].TypeDst;
    if (!typeDst.IsVector)
    {
        return RowCursorUtils.GetGetterAs(typeDst, input, Infos[iinfo].Source);
    }

    // Vector destination: the getter is requested by item type.
    return RowCursorUtils.GetVecGetterAs(typeDst.AsVector.ItemType, input, Infos[iinfo].Source);
}
/// <summary>
/// Prepares the getters used during the pass: a label getter (read as a vector of
/// <c>Float</c>), a score getter, and, when the schema has a weight column, a weight getter.
/// </summary>
/// <param name="row">Row the getters are bound to.</param>
/// <param name="schema">Role-mapped schema providing the label, score and optional weight columns.</param>
public override void InitializeNextPass(IRow row, RoleMappedSchema schema)
{
    Contracts.Assert(PassNum < 1);
    Contracts.AssertValue(schema.Label);

    var scoreColumn = schema.GetUniqueColumn(MetadataUtils.Const.ScoreValueKind.Score);

    _labelGetter = RowCursorUtils.GetVecGetterAs<Float>(NumberType.Float, row, schema.Label.Index);
    _scoreGetter = row.GetGetter<VBuffer<Float>>(scoreColumn.Index);
    Contracts.AssertValue(_labelGetter);
    Contracts.AssertValue(_scoreGetter);

    // The weight column is optional; without it the weight getter is left untouched.
    var weightColumn = schema.Weight;
    if (weightColumn == null)
    {
        return;
    }

    _weightGetter = row.GetGetter<Float>(weightColumn.Index);
}
/// <summary>
/// Returns a getter that reads the slot values of <paramref name="cursor"/> as
/// <see cref="Single"/> labels. R4 slots are read directly, R8 and boolean slots go through
/// <c>GetVecGetterAs</c>, and key slots are mapped to zero-based values with out-of-range
/// keys becoming NaN.
/// </summary>
/// <param name="cursor">Slot cursor whose slot item type determines the conversion.</param>
/// <returns>A getter producing a dense vector of label values.</returns>
public static ValueGetter<VBuffer<Single>> GetLabelGetter(ISlotCursor cursor)
{
    var itemType = cursor.GetSlotType().ItemType;

    if (itemType == NumberType.R4)
    {
        return cursor.GetGetter<Single>();
    }

    if (itemType == NumberType.R8 || itemType.IsBool)
    {
        return GetVecGetterAs<Single>(NumberType.R4, cursor);
    }

    Contracts.Check(itemType.IsKey, "Only floating point number, boolean, and key type values can be used as label.");
    Contracts.Assert(TestGetLabelGetter(itemType) == null);

    // A key count of zero is treated as "no upper bound" on the key value.
    var keyCount = itemType.KeyCount;
    ulong keyMax = keyCount == 0 ? ulong.MaxValue : (ulong)keyCount;

    var readKeys = RowCursorUtils.GetVecGetterAs<ulong>(NumberType.U8, cursor);
    VBuffer<ulong> keys = default(VBuffer<ulong>);

    return (ref VBuffer<Single> dst) =>
    {
        readKeys(ref keys);

        // The default key (0) maps to NaN rather than to the default Single (0), so
        // sparsity cannot be carried over: every slot is written and the output is dense.
        Single[] labels = dst.Values;
        Utils.EnsureSize(ref labels, keys.Length);

        foreach (var item in keys.Items(all: true))
        {
            ulong key = item.Value;
            bool inRange = 0 < key && key <= keyMax;
            labels[item.Key] = inRange ? (Single)(key - 1) : Single.NaN;
        }

        dst = new VBuffer<Single>(keys.Length, labels, dst.Indices);
    };
}
/// <summary>
/// Builds the getter for the bagging case: the input is a vector of keys and the output
/// has one slot per key value, with the contribution of every input key added into the
/// slot of that key.
/// </summary>
/// <param name="input">Row from which the source key vector is read.</param>
/// <param name="iinfo">Index of the column info being produced.</param>
/// <returns>A getter producing a vector of length <c>KeyCount</c> of the source item type.</returns>
private ValueGetter<VBuffer<Float>> MakeGetterBag(IRow input, int iinfo)
{
    Host.AssertValue(input);
    var info = Infos[iinfo];
    var typeSrc = info.TypeSrc;
    Host.Assert(typeSrc.IsVector);
    Host.Assert(typeSrc.ItemType.IsKey);
    Host.Assert(_bag[iinfo]);
    Host.Assert(typeSrc.ItemType.KeyCount == _types[iinfo].VectorSize);

    int size = typeSrc.ItemType.KeyCount;
    Host.Assert(size > 0);
    int expectedLength = typeSrc.VectorSize;
    Host.Assert(expectedLength >= 0);

    var readKeys = RowCursorUtils.GetVecGetterAs<uint>(NumberType.U4, input, info.Source);
    var keys = default(VBuffer<uint>);
    var builder = BufferBuilder<float>.CreateDefault();

    return (ref VBuffer<Float> dst) =>
    {
        builder.Reset(size, false);
        readKeys(ref keys);
        Host.Check(expectedLength == 0 || keys.Length == expectedLength);

        // Only the explicitly stored values are visited; their positions in the source
        // vector are irrelevant when bagging.
        var rawKeys = keys.Values;
        int stored = keys.Count;
        for (int i = 0; i < stored; i++)
        {
            // Keys are one-based; subtracting one from a zero key wraps around to a value
            // that fails the range test below, so such keys are skipped.
            uint zeroBased = rawKeys[i] - 1;
            if (zeroBased >= size)
            {
                continue;
            }

            builder.AddFeature((int)zeroBased, 1);
        }

        builder.GetResult(ref dst);
    };
}
/// <summary>
/// Builds the getter for output column <paramref name="iinfo"/>, which accumulates the
/// n-grams of every source column of that output into a single vector.
/// </summary>
/// <param name="ch">Channel used for assertions.</param>
/// <param name="input">Row from which the source key vectors are read.</param>
/// <param name="iinfo">Index of the output column.</param>
/// <param name="decorator">Optional wrapper applied to the n-gram id finder before it is used.</param>
/// <returns>A <c>ValueGetter&lt;VBuffer&lt;Float&gt;&gt;</c> returned as a <see cref="Delegate"/>.</returns>
private Delegate MakeGetter(IChannel ch, IRow input, int iinfo, FinderDecorator decorator = null)
{
    var info = _bindings.Infos[iinfo];
    ch.Assert(info.SrcTypes.All(t => t.IsVector && t.ItemType.IsKey));

    // One key-vector getter per source column.
    var srcIndices = info.SrcIndices;
    int srcCount = srcIndices.Length;
    var readers = new ValueGetter<VBuffer<uint>>[srcCount];
    for (int isrc = 0; isrc < srcCount; isrc++)
    {
        readers[isrc] = RowCursorUtils.GetVecGetterAs<uint>(NumberType.U4, input, srcIndices[isrc]);
    }

    var keys = default(VBuffer<uint>);

    var finder = GetNgramIdFinder(iinfo);
    if (decorator != null)
    {
        finder = decorator(iinfo, finder);
    }

    var builder = new NgramBufferBuilder(_exes[iinfo].NgramLength, _exes[iinfo].SkipLength,
        _bindings.Types[iinfo].ValueCount, finder);

    // Per-source key bound handed to the builder; a non-positive key count becomes uint.MaxValue.
    var keyCounts = info.SrcTypes
        .Select(t =>
        {
            var keyCount = t.ItemType.KeyCount;
            return keyCount > 0 ? (uint)keyCount : uint.MaxValue;
        })
        .ToArray();

    // REVIEW: Special casing the srcCount==1 case could potentially improve perf.
    ValueGetter<VBuffer<Float>> getter = (ref VBuffer<Float> dst) =>
    {
        builder.Reset();
        for (int isrc = 0; isrc < srcCount; isrc++)
        {
            readers[isrc](ref keys);
            builder.AddNgrams(ref keys, isrc, keyCounts[isrc]);
        }

        builder.GetResult(ref dst);
    };

    return getter;
}
/// <summary>
/// Creates the getters for the five output columns (label, score, L1 distance, squared L2
/// distance and distance). The label and score vectors are read at most once per input
/// position and shared between all getters.
/// </summary>
/// <param name="input">Row providing the label and score columns.</param>
/// <param name="activeCols">Predicate telling which output columns are requested.</param>
/// <param name="disposer">Always set to null.</param>
/// <returns>An array of five delegates; entries for inactive columns are left null.</returns>
public override Delegate[] CreateGetters(IRow input, Func<int, bool> activeCols, out Action disposer)
{
    Host.Assert(LabelIndex >= 0);
    Host.Assert(ScoreIndex >= 0);
    disposer = null;

    long cachedPosition = -1;
    var label = default(VBuffer<Float>);
    var score = default(VBuffer<Float>);

    // Stand-in used when a source vector is not needed by any active output.
    ValueGetter<VBuffer<Float>> nullGetter = (ref VBuffer<Float> vec) => vec = default(VBuffer<Float>);

    // All three distance outputs need both the label and the score.
    Func<bool> anyDistanceActive = () => activeCols(L1Output) || activeCols(L2Output) || activeCols(DistCol);

    ValueGetter<VBuffer<Float>> labelGetter;
    if (activeCols(LabelOutput) || anyDistanceActive())
    {
        labelGetter = RowCursorUtils.GetVecGetterAs<Float>(NumberType.Float, input, LabelIndex);
    }
    else
    {
        labelGetter = nullGetter;
    }

    ValueGetter<VBuffer<Float>> scoreGetter;
    if (activeCols(ScoreOutput) || anyDistanceActive())
    {
        scoreGetter = input.GetGetter<VBuffer<Float>>(ScoreIndex);
    }
    else
    {
        scoreGetter = nullGetter;
    }

    // Re-reads label and score only when the input has moved since the last read.
    Action refreshCache = () =>
    {
        if (cachedPosition == input.Position)
        {
            return;
        }

        labelGetter(ref label);
        scoreGetter(ref score);
        cachedPosition = input.Position;
    };

    var result = new Delegate[5];

    if (activeCols(LabelOutput))
    {
        result[LabelOutput] = (ValueGetter<VBuffer<Float>>)((ref VBuffer<Float> dst) =>
        {
            refreshCache();
            label.CopyTo(ref dst);
        });
    }

    if (activeCols(ScoreOutput))
    {
        result[ScoreOutput] = (ValueGetter<VBuffer<Float>>)((ref VBuffer<Float> dst) =>
        {
            refreshCache();
            score.CopyTo(ref dst);
        });
    }

    if (activeCols(L1Output))
    {
        result[L1Output] = (ValueGetter<double>)((ref double dst) =>
        {
            refreshCache();
            dst = VectorUtils.L1Distance(ref label, ref score);
        });
    }

    if (activeCols(L2Output))
    {
        result[L2Output] = (ValueGetter<double>)((ref double dst) =>
        {
            refreshCache();
            dst = VectorUtils.L2DistSquared(ref label, ref score);
        });
    }

    if (activeCols(DistCol))
    {
        result[DistCol] = (ValueGetter<double>)((ref double dst) =>
        {
            refreshCache();
            dst = MathUtils.Sqrt(VectorUtils.L2DistSquared(ref label, ref score));
        });
    }

    return result;
}
/// <summary>
/// Builds the getter for the indicator (non-bagging) case: each slot of the input key
/// vector owns a run of <c>size</c> output slots (<c>size</c> being the key count), and a
/// valid key sets a single 1 inside the run of its slot. The runs are concatenated.
/// </summary>
/// <param name="input">Row from which the source key vector is read.</param>
/// <param name="iinfo">Index of the column info being produced.</param>
/// <returns>A getter producing a sparse vector of length <c>size * src.Length</c>.</returns>
private ValueGetter<VBuffer<Float>> MakeGetterInd(IRow input, int iinfo)
{
    Host.AssertValue(input);
    var info = Infos[iinfo];
    var typeSrc = info.TypeSrc;
    Host.Assert(typeSrc.IsVector);
    Host.Assert(typeSrc.ItemType.IsKey);
    Host.Assert(!_bag[iinfo]);

    int size = typeSrc.ItemType.KeyCount;
    Host.Assert(size > 0);
    int expectedLength = typeSrc.VectorSize;
    Host.Assert(expectedLength >= 0);
    Host.Assert(_types[iinfo].VectorSize == size * expectedLength);

    var readKeys = RowCursorUtils.GetVecGetterAs<uint>(NumberType.U4, input, info.Source);
    var keys = default(VBuffer<uint>);

    return (ref VBuffer<Float> dst) =>
    {
        readKeys(ref keys);
        int lenSrc = keys.Length;
        Host.Check(expectedLength == 0 || lenSrc == expectedLength);

        // Output entries are produced in increasing index order, so the result can be
        // written straight into the destination arrays without a builder.
        var outValues = dst.Values;
        var outIndices = dst.Indices;
        int lenDst = checked(size * lenSrc);
        int stored = keys.Count;

        // At most one output entry per stored source value.
        if (Utils.Size(outValues) < stored)
        {
            outValues = new Float[stored];
        }
        if (Utils.Size(outIndices) < stored)
        {
            outIndices = new int[stored];
        }

        var rawKeys = keys.Values;
        bool dense = keys.IsDense;
        if (dense)
        {
            Host.Assert(lenSrc == stored);
        }

        // For a dense source the position in the value array is the slot; for a sparse
        // source the slot comes from the index array.
        int[] srcIndices = dense ? null : keys.Indices;

        int written = 0;
        for (int i = 0; i < stored; i++)
        {
            Host.Assert(written < stored);

            // Keys are one-based; a zero key wraps around and fails the range test.
            uint zeroBased = rawKeys[i] - 1;
            if (zeroBased >= (uint)size)
            {
                continue;
            }

            outValues[written] = 1;
            int slot = dense ? i : srcIndices[i];
            outIndices[written++] = slot * size + (int)zeroBased;
        }

        dst = new VBuffer<Float>(lenDst, written, outValues, outIndices);
    };
}
/// <summary>
/// Creates the getter for output column <paramref name="iinfo"/>: it reads the source key
/// vector, counts its n-grams with an <c>NgramBufferBuilder</c>, and then applies the
/// weighting selected in <c>_exes[iinfo].Weighting</c> (Tf, Idf or TfIdf).
/// </summary>
/// <param name="ch">Optional channel; may be null.</param>
/// <param name="input">Row from which the source key vector is read.</param>
/// <param name="iinfo">Index of the column info, in <c>[0, Infos.Length)</c>.</param>
/// <param name="disposer">Always set to null.</param>
/// <returns>A <c>ValueGetter&lt;VBuffer&lt;Float&gt;&gt;</c> returned as a <see cref="Delegate"/>.</returns>
protected override Delegate GetGetterCore(IChannel ch, IRow input, int iinfo, out Action disposer)
{
    Host.AssertValueOrNull(ch);
    Host.AssertValue(input);
    Host.Assert(0 <= iinfo && iinfo < Infos.Length);
    Host.Assert(Infos[iinfo].TypeSrc.IsVector);
    Host.Assert(Infos[iinfo].TypeSrc.ItemType.IsKey);
    disposer = null;

    var readKeys = RowCursorUtils.GetVecGetterAs<uint>(NumberType.U4, input, Infos[iinfo].Source);
    var keys = default(VBuffer<uint>);
    var builder = new NgramBufferBuilder(_exes[iinfo].NgramLength, _exes[iinfo].SkipLength,
        _ngramMaps[iinfo].Count, GetNgramIdFinder(iinfo));

    // A key count of zero is replaced by uint.MaxValue before being handed to the builder.
    var declaredKeyCount = Infos[iinfo].TypeSrc.ItemType.KeyCount;
    uint keyCount = declaredKeyCount == 0 ? uint.MaxValue : (uint)declaredKeyCount;

    ValueGetter<VBuffer<Float>> getter;
    switch (_exes[iinfo].Weighting)
    {
        case WeightingCriteria.TfIdf:
            Host.AssertValue(_invDocFreqs[iinfo]);
            getter = (ref VBuffer<Float> dst) =>
            {
                readKeys(ref keys);
                if (builder.IsEmpty)
                {
                    // Empty builder: emit a zero-length vector, reusing dst's arrays.
                    dst = new VBuffer<Float>(0, dst.Values, dst.Indices);
                    return;
                }

                builder.Reset();
                builder.AddNgrams(ref keys, 0, keyCount);
                builder.GetResult(ref dst);

                // Scale each count by the inverse document frequency of its slot.
                VBufferUtils.Apply(ref dst, (int i, ref Float v) => v = (Float)(v * _invDocFreqs[iinfo][i]));
            };
            break;

        case WeightingCriteria.Idf:
            Host.AssertValue(_invDocFreqs[iinfo]);
            getter = (ref VBuffer<Float> dst) =>
            {
                readKeys(ref keys);
                if (builder.IsEmpty)
                {
                    dst = new VBuffer<Float>(0, dst.Values, dst.Indices);
                    return;
                }

                builder.Reset();
                builder.AddNgrams(ref keys, 0, keyCount);
                builder.GetResult(ref dst);

                // Counts of at least 1 become the slot's inverse document frequency; the rest become 0.
                VBufferUtils.Apply(ref dst, (int i, ref Float v) => v = v >= 1 ? (Float)_invDocFreqs[iinfo][i] : 0);
            };
            break;

        case WeightingCriteria.Tf:
            getter = (ref VBuffer<Float> dst) =>
            {
                readKeys(ref keys);
                if (builder.IsEmpty)
                {
                    dst = new VBuffer<Float>(0, dst.Values, dst.Indices);
                    return;
                }

                builder.Reset();
                builder.AddNgrams(ref keys, 0, keyCount);
                builder.GetResult(ref dst);
            };
            break;

        default:
            throw Host.Except("Unsupported weighting criteria");
    }

    return getter;
}
/// <summary>
/// Scans <paramref name="trainingData"/> once and builds, for every column, the pool of
/// n-grams that will form the dictionary, together with the inverse document frequencies
/// for columns whose weighting requires them.
/// </summary>
/// <param name="args">Transform arguments; supplies per-column and global n-gram limits.</param>
/// <param name="trainingData">Data view the dictionaries are built from.</param>
/// <param name="invDocFreqs">
/// Per column, the inverse document frequency of each n-gram slot; an entry stays null for
/// columns that never required IDF.
/// </param>
/// <returns>One <c>SequencePool</c> of n-grams per column.</returns>
private SequencePool[] Train(Arguments args, IDataView trainingData, out double[][] invDocFreqs)
{
    int infoCount = Infos.Length;

    // For each column, the maximum number of grams to keep in the dictionary per n-gram
    // length: position 0 holds the limit for 1-grams, position ngramLength-1 the limit for
    // ngramLength-grams.
    var lims = new int[infoCount][];
    for (int iinfo = 0; iinfo < infoCount; iinfo++)
    {
        var column = args.Column[iinfo];
        bool all = column.AllLengths ?? args.AllLengths;
        var ngramLength = _exes[iinfo].NgramLength;

        // Column-level limits win over the global ones when any are given.
        var maxNumTerms = Utils.Size(column.MaxNumTerms) > 0 ? column.MaxNumTerms : args.MaxNumTerms;
        int termCount = Utils.Size(maxNumTerms);

        if (all)
        {
            Host.CheckUserArg(termCount <= ngramLength, nameof(args.MaxNumTerms));
            Host.CheckUserArg(
                termCount == 0 || maxNumTerms.All(i => i >= 0) && maxNumTerms[termCount - 1] > 0,
                nameof(args.MaxNumTerms));

            // Lengths beyond the supplied limits reuse the last supplied limit (or the default).
            var extend = termCount == 0 ? Arguments.DefaultMaxTerms : maxNumTerms[termCount - 1];
            lims[iinfo] = Utils.BuildArray(ngramLength, i => i < termCount ? maxNumTerms[i] : extend);
        }
        else
        {
            Host.CheckUserArg(termCount == 0 || termCount == 1 && maxNumTerms[0] > 0, nameof(args.MaxNumTerms));

            // Only the longest n-gram length gets a non-zero limit.
            var limits = new int[ngramLength];
            limits[ngramLength - 1] = termCount == 0 ? Arguments.DefaultMaxTerms : maxNumTerms[0];
            lims[iinfo] = limits;
        }
    }

    var helpers = new NgramBufferBuilder[infoCount];
    var getters = new ValueGetter<VBuffer<uint>>[infoCount];
    var src = new VBuffer<uint>[infoCount];

    // counts[iinfo][i] is the number of (i+1)-grams currently in the pool of column iinfo.
    var counts = new int[infoCount][];
    var ngramMaps = new SequencePool[infoCount];

    // Activate only the source columns that are actually read.
    var activeInput = new bool[trainingData.Schema.ColumnCount];
    foreach (var info in Infos)
    {
        activeInput[info.Source] = true;
    }

    using (var cursor = trainingData.GetRowCursor(col => activeInput[col]))
    using (var pch = Host.StartProgressChannel("Building n-gram dictionary"))
    {
        for (int iinfo = 0; iinfo < infoCount; iinfo++)
        {
            Host.Assert(Infos[iinfo].TypeSrc.IsVector && Infos[iinfo].TypeSrc.ItemType.IsKey);
            var ngramLength = _exes[iinfo].NgramLength;
            var skipLength = _exes[iinfo].SkipLength;

            getters[iinfo] = RowCursorUtils.GetVecGetterAs<uint>(NumberType.U4, cursor, Infos[iinfo].Source);
            src[iinfo] = default(VBuffer<uint>);
            counts[iinfo] = new int[ngramLength];
            ngramMaps[iinfo] = new SequencePool();

            // The id finder created by GetNgramIdFinderAdd decides how many n-grams of each
            // length are added (using lims[iinfo]), so the builder's slot limit is set to
            // the maximum.
            var finder = GetNgramIdFinderAdd(counts[iinfo], lims[iinfo], ngramMaps[iinfo],
                _exes[iinfo].RequireIdf(), Host);
            helpers[iinfo] = new NgramBufferBuilder(ngramLength, skipLength, Utils.ArrayMaxSize, finder);
        }

        // NOTE(review): nothing in this method sets an entry of infoFull or increments
        // cInfoFull, so the scan below stops only when the cursor is exhausted. Confirm
        // whether early termination was intended.
        int cInfoFull = 0;
        var infoFull = new bool[infoCount];

        invDocFreqs = new double[infoCount][];
        long totalDocs = 0;
        Double rowCount = trainingData.GetRowCount(true) ?? Double.NaN;
        var buffers = new VBuffer<float>[infoCount];

        pch.SetHeader(
            new ProgressHeader(new[] { "Total n-grams" }, new[] { "documents" }),
            e => e.SetProgress(0, totalDocs, rowCount));

        while (cInfoFull < infoCount && cursor.MoveNext())
        {
            totalDocs++;
            for (int iinfo = 0; iinfo < infoCount; iinfo++)
            {
                getters[iinfo](ref src[iinfo]);

                // A key count of zero is replaced by uint.MaxValue before being handed to the builder.
                var declaredKeyCount = Infos[iinfo].TypeSrc.ItemType.KeyCount;
                uint keyCount = (uint)declaredKeyCount == 0 ? uint.MaxValue : (uint)declaredKeyCount;

                if (!infoFull[iinfo])
                {
                    // With IDF the builder is reset per row so that its result reflects
                    // this document only.
                    if (_exes[iinfo].RequireIdf())
                    {
                        helpers[iinfo].Reset();
                    }

                    helpers[iinfo].AddNgrams(ref src[iinfo], 0, keyCount);

                    if (_exes[iinfo].RequireIdf())
                    {
                        int totalNgrams = counts[iinfo].Sum();
                        Utils.EnsureSize(ref invDocFreqs[iinfo], totalNgrams);
                        helpers[iinfo].GetResult(ref buffers[iinfo]);

                        // Count one document for every n-gram that occurred at least once in this row.
                        var docCounts = invDocFreqs[iinfo];
                        foreach (var item in buffers[iinfo].Items())
                        {
                            if (item.Value >= 1)
                            {
                                docCounts[item.Key] += 1;
                            }
                        }
                    }
                }

                AssertValid(counts[iinfo], lims[iinfo], ngramMaps[iinfo]);
            }
        }

        pch.Checkpoint(counts.Sum(c => c.Sum()), totalDocs);

        // Turn the document counts into log(totalDocs / documentCount); zero counts stay zero.
        for (int iinfo = 0; iinfo < infoCount; iinfo++)
        {
            var docCounts = invDocFreqs[iinfo];
            int slotCount = Utils.Size(docCounts);
            for (int i = 0; i < slotCount; i++)
            {
                if (docCounts[i] != 0)
                {
                    docCounts[i] = Math.Log(totalDocs / docCounts[i]);
                }
            }
        }

        // Record which n-gram lengths ended up with at least one entry.
        for (int iinfo = 0; iinfo < infoCount; iinfo++)
        {
            AssertValid(counts[iinfo], lims[iinfo], ngramMaps[iinfo]);

            int ngramLength = _exes[iinfo].NgramLength;
            for (int i = 0; i < ngramLength; i++)
            {
                _exes[iinfo].NonEmptyLevels[i] = counts[iinfo][i] > 0;
            }
        }

        return ngramMaps;
    }
}